diff --git a/.gitattributes b/.gitattributes
index a6344aac8c09253b3b630fb776ae94478aa0275b..ba3b78ffc90f4889d578ef6854100cc0caad0af2 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -33,3 +33,10 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
+checkpoint-1060/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-1272/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-212/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-424/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-636/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-848/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
diff --git a/README.md b/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..711528861bef41425b7b6c141a9c78ebbd92a3e1
--- /dev/null
+++ b/README.md
@@ -0,0 +1,146 @@
+---
+library_name: peft
+license: llama3.3
+base_model: meta-llama/Llama-3.3-70B-Instruct
+tags:
+- generated_from_trainer
+datasets:
+- ugaoo/subset_each5k_multimedqa
+model-index:
+- name: out/subset_each5k_multimedqa
+ results: []
+---
+
+
+
+[
](https://github.com/axolotl-ai-cloud/axolotl)
+See axolotl config
+
+axolotl version: `0.8.0.dev0`
+```yaml
+base_model: meta-llama/Llama-3.3-70B-Instruct
+model_type: AutoModelForCausalLM
+tokenizer_type: AutoTokenizer
+trust_remote_code: true
+
+load_in_8bit: false
+load_in_4bit: true
+strict: false
+
+datasets:
+ - path: ugaoo/subset_each5k_multimedqa
+ type: alpaca
+val_set_size: 0
+output_dir: ./out/subset_each5k_multimedqa
+
+sequence_len: 4000
+sample_packing: true
+pad_to_sequence_len: true
+
+adapter: qlora
+lora_r: 256
+lora_alpha: 512
+lora_dropout: 0.05
+lora_target_linear: true
+lora_target_modules:
+ - q_proj
+ - k_proj
+ - v_proj
+ - o_proj
+ - up_proj
+ - down_proj
+ - gate_proj
+lora_modules_to_save:
+ - embed_tokens
+ - lm_head
+
+wandb_project: cosmosearch
+wandb_entity:
+wandb_watch:
+wandb_name: subset_each5k_multimedqa_llama33
+wandb_log_model:
+
+gradient_accumulation_steps: 3
+micro_batch_size: 4
+num_epochs: 6
+optimizer: adamw_torch
+lr_scheduler: cosine
+learning_rate: 5e-6
+
+train_on_inputs: false
+group_by_length: false
+bf16: auto
+fp16: false
+tf32: false
+
+gradient_checkpointing: true
+early_stopping_patience:
+resume_from_checkpoint:
+logging_steps: 1
+xformers_attention:
+flash_attention: true
+
+warmup_steps: 100
+evals_per_epoch: 6
+eval_table_size:
+saves_per_epoch: 1
+debug:
+deepspeed:
+weight_decay: 0.0
+fsdp:
+fsdp_config:
+save_total_limit: 6
+special_tokens:
+ pad_token: <|end_of_text|>
+
+```
+
+
+
+# out/subset_each5k_multimedqa
+
+This model is a fine-tuned version of [meta-llama/Llama-3.3-70B-Instruct](https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct) on the ugaoo/subset_each5k_multimedqa dataset.
+
+## Model description
+
+More information needed
+
+## Intended uses & limitations
+
+More information needed
+
+## Training and evaluation data
+
+More information needed
+
+## Training procedure
+
+### Training hyperparameters
+
+The following hyperparameters were used during training:
+- learning_rate: 5e-06
+- train_batch_size: 4
+- eval_batch_size: 4
+- seed: 42
+- distributed_type: multi-GPU
+- num_devices: 3
+- gradient_accumulation_steps: 3
+- total_train_batch_size: 36
+- total_eval_batch_size: 12
+- optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
+- lr_scheduler_type: cosine
+- lr_scheduler_warmup_steps: 100
+- num_epochs: 6.0
+
+### Training results
+
+
+
+### Framework versions
+
+- PEFT 0.15.0
+- Transformers 4.49.0
+- Pytorch 2.5.1+cu124
+- Datasets 3.4.1
+- Tokenizers 0.21.1
\ No newline at end of file
diff --git a/adapter_config.json b/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..dc930e1be2d901773c96d6e6d186c72676cbf328
--- /dev/null
+++ b/adapter_config.json
@@ -0,0 +1,42 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "meta-llama/Llama-3.3-70B-Instruct",
+ "bias": "none",
+ "corda_config": null,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": null,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 512,
+ "lora_bias": false,
+ "lora_dropout": 0.05,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": [
+ "embed_tokens",
+ "lm_head"
+ ],
+ "peft_type": "LORA",
+ "r": 256,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "up_proj",
+ "gate_proj",
+ "o_proj",
+ "v_proj",
+ "q_proj",
+ "k_proj",
+ "down_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/adapter_model.safetensors b/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..fc5b9d288686bcfc0dd8fef5f40baf5e7a82badf
--- /dev/null
+++ b/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:daf3ced3bd8b21263fefde6234932a6f73d3a1191d93694a7382d35b17c0be53
+size 10829849744
diff --git a/checkpoint-1060/README.md b/checkpoint-1060/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1b184114a0c28ed3e4c082c18486736dc818166d
--- /dev/null
+++ b/checkpoint-1060/README.md
@@ -0,0 +1,202 @@
+---
+base_model: meta-llama/Llama-3.3-70B-Instruct
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.15.0
\ No newline at end of file
diff --git a/checkpoint-1060/adapter_config.json b/checkpoint-1060/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..dc930e1be2d901773c96d6e6d186c72676cbf328
--- /dev/null
+++ b/checkpoint-1060/adapter_config.json
@@ -0,0 +1,42 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "meta-llama/Llama-3.3-70B-Instruct",
+ "bias": "none",
+ "corda_config": null,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": null,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 512,
+ "lora_bias": false,
+ "lora_dropout": 0.05,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": [
+ "embed_tokens",
+ "lm_head"
+ ],
+ "peft_type": "LORA",
+ "r": 256,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "up_proj",
+ "gate_proj",
+ "o_proj",
+ "v_proj",
+ "q_proj",
+ "k_proj",
+ "down_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-1060/adapter_model.safetensors b/checkpoint-1060/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..c6cec6c4d1de17b273f352d0f24996609e971aa5
--- /dev/null
+++ b/checkpoint-1060/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a42ff00e3b0e305d427a60e76e55db126591aa29ad33a748d4a76475661a4f5e
+size 10829849744
diff --git a/checkpoint-1060/global_step1061/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-1060/global_step1061/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..2b881ca904c8cfaef246e01bdf94db6edc057fe2
--- /dev/null
+++ b/checkpoint-1060/global_step1061/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c325307806c1cf8898a4e7ea3108455bc15b6804cc721112a0aa89193e66a2a7
+size 21659418140
diff --git a/checkpoint-1060/global_step1061/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-1060/global_step1061/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..9ed2d37e36d0c4f5610761a0d4394e424d71f598
--- /dev/null
+++ b/checkpoint-1060/global_step1061/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:84294d280ce427c8e01f63c449ffd651b0337937ad787e17d7e3582046836ff9
+size 21659457372
diff --git a/checkpoint-1060/global_step1061/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-1060/global_step1061/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..9a3af00c826dfc17f68f63b8e9ed4d86c1af4870
--- /dev/null
+++ b/checkpoint-1060/global_step1061/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:07bfd1564a1e98896cf6d6192e969e5aa521312ffcaa748f28f8ea9a569f70e5
+size 21659417820
diff --git a/checkpoint-1060/global_step1061/mp_rank_00_model_states.pt b/checkpoint-1060/global_step1061/mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..4ce30eb6ab8495079efdd33f501a0bc6460d8f0b
--- /dev/null
+++ b/checkpoint-1060/global_step1061/mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2e34cb64ce9de3f8e9b99358f928148b80bad4c2787aa0b87a2d713b49cb42df
+size 11918643933
diff --git a/checkpoint-1060/latest b/checkpoint-1060/latest
new file mode 100644
index 0000000000000000000000000000000000000000..b52a6c65e0f7cc69ce39e58aeab5d173c1b2c92c
--- /dev/null
+++ b/checkpoint-1060/latest
@@ -0,0 +1 @@
+global_step1061
\ No newline at end of file
diff --git a/checkpoint-1060/rng_state_0.pth b/checkpoint-1060/rng_state_0.pth
new file mode 100644
index 0000000000000000000000000000000000000000..eaa998991f302d82aab0d6fb2917124381885f47
--- /dev/null
+++ b/checkpoint-1060/rng_state_0.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:aa1ac5057ab260a8972dfc0d45ef6ab8b7d286627e2ea4e6384a6e79e973df66
+size 14768
diff --git a/checkpoint-1060/rng_state_1.pth b/checkpoint-1060/rng_state_1.pth
new file mode 100644
index 0000000000000000000000000000000000000000..73c09cf0beb6096e148693fd0b9f18d3ea0a93e1
--- /dev/null
+++ b/checkpoint-1060/rng_state_1.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5e6e6ef014686f305ff46add96daec7dcac9ea435d61a9ce5ec920efee3bfb64
+size 14768
diff --git a/checkpoint-1060/rng_state_2.pth b/checkpoint-1060/rng_state_2.pth
new file mode 100644
index 0000000000000000000000000000000000000000..45d2d7dfd851dc4a0196d248bca1f3c60f7d7928
--- /dev/null
+++ b/checkpoint-1060/rng_state_2.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6f93a08f9e61c2cdbcaa7be9a9276a30073b05ba9f28491f0fd6a6cdd1b674f1
+size 14768
diff --git a/checkpoint-1060/scheduler.pt b/checkpoint-1060/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..187fd51e6e7d524566820734cd65b88dee47de4e
--- /dev/null
+++ b/checkpoint-1060/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:67605d649c4e21fd125858ca561c7bf537f4d0e4dbdb264ca7774242cf8cda75
+size 1064
diff --git a/checkpoint-1060/special_tokens_map.json b/checkpoint-1060/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..278b7f0f84be865c4687700ee7b3c63d89a51e18
--- /dev/null
+++ b/checkpoint-1060/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+ "bos_token": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "<|eot_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/checkpoint-1060/tokenizer.json b/checkpoint-1060/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2
--- /dev/null
+++ b/checkpoint-1060/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
+size 17209920
diff --git a/checkpoint-1060/tokenizer_config.json b/checkpoint-1060/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ca91a2ef55f4239a7af81d7c9abb05f53621a07b
--- /dev/null
+++ b/checkpoint-1060/tokenizer_config.json
@@ -0,0 +1,2064 @@
+{
+ "added_tokens_decoder": {
+ "128000": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128001": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128002": {
+ "content": "<|reserved_special_token_0|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128003": {
+ "content": "<|reserved_special_token_1|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128004": {
+ "content": "<|finetune_right_pad_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128005": {
+ "content": "<|reserved_special_token_2|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128006": {
+ "content": "<|start_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128007": {
+ "content": "<|end_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128008": {
+ "content": "<|eom_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128009": {
+ "content": "<|eot_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128010": {
+ "content": "<|python_tag|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128011": {
+ "content": "<|reserved_special_token_3|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128012": {
+ "content": "<|reserved_special_token_4|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128013": {
+ "content": "<|reserved_special_token_5|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128014": {
+ "content": "<|reserved_special_token_6|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128015": {
+ "content": "<|reserved_special_token_7|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128016": {
+ "content": "<|reserved_special_token_8|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128017": {
+ "content": "<|reserved_special_token_9|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128018": {
+ "content": "<|reserved_special_token_10|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128019": {
+ "content": "<|reserved_special_token_11|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128020": {
+ "content": "<|reserved_special_token_12|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128021": {
+ "content": "<|reserved_special_token_13|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128022": {
+ "content": "<|reserved_special_token_14|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128023": {
+ "content": "<|reserved_special_token_15|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128024": {
+ "content": "<|reserved_special_token_16|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128025": {
+ "content": "<|reserved_special_token_17|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128026": {
+ "content": "<|reserved_special_token_18|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128027": {
+ "content": "<|reserved_special_token_19|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128028": {
+ "content": "<|reserved_special_token_20|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128029": {
+ "content": "<|reserved_special_token_21|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128030": {
+ "content": "<|reserved_special_token_22|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128031": {
+ "content": "<|reserved_special_token_23|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128032": {
+ "content": "<|reserved_special_token_24|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128033": {
+ "content": "<|reserved_special_token_25|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128034": {
+ "content": "<|reserved_special_token_26|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128035": {
+ "content": "<|reserved_special_token_27|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128036": {
+ "content": "<|reserved_special_token_28|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128037": {
+ "content": "<|reserved_special_token_29|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128038": {
+ "content": "<|reserved_special_token_30|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128039": {
+ "content": "<|reserved_special_token_31|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128040": {
+ "content": "<|reserved_special_token_32|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128041": {
+ "content": "<|reserved_special_token_33|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128042": {
+ "content": "<|reserved_special_token_34|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128043": {
+ "content": "<|reserved_special_token_35|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128044": {
+ "content": "<|reserved_special_token_36|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128045": {
+ "content": "<|reserved_special_token_37|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128046": {
+ "content": "<|reserved_special_token_38|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128047": {
+ "content": "<|reserved_special_token_39|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128048": {
+ "content": "<|reserved_special_token_40|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128049": {
+ "content": "<|reserved_special_token_41|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128050": {
+ "content": "<|reserved_special_token_42|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128051": {
+ "content": "<|reserved_special_token_43|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128052": {
+ "content": "<|reserved_special_token_44|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128053": {
+ "content": "<|reserved_special_token_45|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128054": {
+ "content": "<|reserved_special_token_46|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128055": {
+ "content": "<|reserved_special_token_47|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128056": {
+ "content": "<|reserved_special_token_48|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128057": {
+ "content": "<|reserved_special_token_49|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128058": {
+ "content": "<|reserved_special_token_50|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128059": {
+ "content": "<|reserved_special_token_51|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128060": {
+ "content": "<|reserved_special_token_52|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128061": {
+ "content": "<|reserved_special_token_53|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128062": {
+ "content": "<|reserved_special_token_54|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128063": {
+ "content": "<|reserved_special_token_55|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128064": {
+ "content": "<|reserved_special_token_56|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128065": {
+ "content": "<|reserved_special_token_57|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128066": {
+ "content": "<|reserved_special_token_58|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128067": {
+ "content": "<|reserved_special_token_59|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128068": {
+ "content": "<|reserved_special_token_60|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128069": {
+ "content": "<|reserved_special_token_61|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128070": {
+ "content": "<|reserved_special_token_62|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128071": {
+ "content": "<|reserved_special_token_63|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128072": {
+ "content": "<|reserved_special_token_64|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128073": {
+ "content": "<|reserved_special_token_65|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128074": {
+ "content": "<|reserved_special_token_66|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128075": {
+ "content": "<|reserved_special_token_67|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128076": {
+ "content": "<|reserved_special_token_68|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128077": {
+ "content": "<|reserved_special_token_69|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128078": {
+ "content": "<|reserved_special_token_70|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128079": {
+ "content": "<|reserved_special_token_71|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128080": {
+ "content": "<|reserved_special_token_72|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128081": {
+ "content": "<|reserved_special_token_73|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128082": {
+ "content": "<|reserved_special_token_74|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128083": {
+ "content": "<|reserved_special_token_75|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128084": {
+ "content": "<|reserved_special_token_76|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128085": {
+ "content": "<|reserved_special_token_77|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128086": {
+ "content": "<|reserved_special_token_78|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128087": {
+ "content": "<|reserved_special_token_79|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128088": {
+ "content": "<|reserved_special_token_80|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128089": {
+ "content": "<|reserved_special_token_81|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128090": {
+ "content": "<|reserved_special_token_82|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128091": {
+ "content": "<|reserved_special_token_83|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128092": {
+ "content": "<|reserved_special_token_84|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128093": {
+ "content": "<|reserved_special_token_85|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128094": {
+ "content": "<|reserved_special_token_86|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128095": {
+ "content": "<|reserved_special_token_87|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128096": {
+ "content": "<|reserved_special_token_88|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128097": {
+ "content": "<|reserved_special_token_89|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128098": {
+ "content": "<|reserved_special_token_90|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128099": {
+ "content": "<|reserved_special_token_91|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128100": {
+ "content": "<|reserved_special_token_92|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128101": {
+ "content": "<|reserved_special_token_93|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128102": {
+ "content": "<|reserved_special_token_94|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128103": {
+ "content": "<|reserved_special_token_95|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128104": {
+ "content": "<|reserved_special_token_96|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128105": {
+ "content": "<|reserved_special_token_97|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128106": {
+ "content": "<|reserved_special_token_98|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128107": {
+ "content": "<|reserved_special_token_99|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128108": {
+ "content": "<|reserved_special_token_100|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128109": {
+ "content": "<|reserved_special_token_101|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128110": {
+ "content": "<|reserved_special_token_102|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128111": {
+ "content": "<|reserved_special_token_103|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128112": {
+ "content": "<|reserved_special_token_104|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128113": {
+ "content": "<|reserved_special_token_105|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128114": {
+ "content": "<|reserved_special_token_106|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128115": {
+ "content": "<|reserved_special_token_107|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128116": {
+ "content": "<|reserved_special_token_108|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128117": {
+ "content": "<|reserved_special_token_109|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128118": {
+ "content": "<|reserved_special_token_110|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128119": {
+ "content": "<|reserved_special_token_111|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128120": {
+ "content": "<|reserved_special_token_112|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128121": {
+ "content": "<|reserved_special_token_113|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128122": {
+ "content": "<|reserved_special_token_114|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128123": {
+ "content": "<|reserved_special_token_115|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128124": {
+ "content": "<|reserved_special_token_116|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128125": {
+ "content": "<|reserved_special_token_117|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128126": {
+ "content": "<|reserved_special_token_118|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128127": {
+ "content": "<|reserved_special_token_119|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128128": {
+ "content": "<|reserved_special_token_120|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128129": {
+ "content": "<|reserved_special_token_121|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128130": {
+ "content": "<|reserved_special_token_122|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128131": {
+ "content": "<|reserved_special_token_123|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128132": {
+ "content": "<|reserved_special_token_124|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128133": {
+ "content": "<|reserved_special_token_125|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128134": {
+ "content": "<|reserved_special_token_126|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128135": {
+ "content": "<|reserved_special_token_127|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128136": {
+ "content": "<|reserved_special_token_128|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128137": {
+ "content": "<|reserved_special_token_129|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128138": {
+ "content": "<|reserved_special_token_130|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128139": {
+ "content": "<|reserved_special_token_131|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128140": {
+ "content": "<|reserved_special_token_132|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128141": {
+ "content": "<|reserved_special_token_133|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128142": {
+ "content": "<|reserved_special_token_134|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128143": {
+ "content": "<|reserved_special_token_135|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128144": {
+ "content": "<|reserved_special_token_136|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128145": {
+ "content": "<|reserved_special_token_137|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128146": {
+ "content": "<|reserved_special_token_138|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128147": {
+ "content": "<|reserved_special_token_139|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128148": {
+ "content": "<|reserved_special_token_140|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128149": {
+ "content": "<|reserved_special_token_141|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128150": {
+ "content": "<|reserved_special_token_142|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128151": {
+ "content": "<|reserved_special_token_143|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128152": {
+ "content": "<|reserved_special_token_144|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128153": {
+ "content": "<|reserved_special_token_145|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128154": {
+ "content": "<|reserved_special_token_146|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128155": {
+ "content": "<|reserved_special_token_147|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128156": {
+ "content": "<|reserved_special_token_148|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128157": {
+ "content": "<|reserved_special_token_149|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128158": {
+ "content": "<|reserved_special_token_150|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128159": {
+ "content": "<|reserved_special_token_151|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128160": {
+ "content": "<|reserved_special_token_152|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128161": {
+ "content": "<|reserved_special_token_153|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128162": {
+ "content": "<|reserved_special_token_154|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128163": {
+ "content": "<|reserved_special_token_155|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128164": {
+ "content": "<|reserved_special_token_156|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128165": {
+ "content": "<|reserved_special_token_157|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128166": {
+ "content": "<|reserved_special_token_158|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128167": {
+ "content": "<|reserved_special_token_159|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128168": {
+ "content": "<|reserved_special_token_160|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128169": {
+ "content": "<|reserved_special_token_161|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128170": {
+ "content": "<|reserved_special_token_162|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128171": {
+ "content": "<|reserved_special_token_163|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128172": {
+ "content": "<|reserved_special_token_164|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128173": {
+ "content": "<|reserved_special_token_165|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128174": {
+ "content": "<|reserved_special_token_166|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128175": {
+ "content": "<|reserved_special_token_167|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128176": {
+ "content": "<|reserved_special_token_168|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128177": {
+ "content": "<|reserved_special_token_169|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128178": {
+ "content": "<|reserved_special_token_170|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128179": {
+ "content": "<|reserved_special_token_171|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128180": {
+ "content": "<|reserved_special_token_172|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128181": {
+ "content": "<|reserved_special_token_173|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128182": {
+ "content": "<|reserved_special_token_174|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128183": {
+ "content": "<|reserved_special_token_175|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128184": {
+ "content": "<|reserved_special_token_176|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128185": {
+ "content": "<|reserved_special_token_177|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128186": {
+ "content": "<|reserved_special_token_178|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128187": {
+ "content": "<|reserved_special_token_179|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128188": {
+ "content": "<|reserved_special_token_180|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128189": {
+ "content": "<|reserved_special_token_181|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128190": {
+ "content": "<|reserved_special_token_182|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128191": {
+ "content": "<|reserved_special_token_183|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128192": {
+ "content": "<|reserved_special_token_184|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128193": {
+ "content": "<|reserved_special_token_185|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128194": {
+ "content": "<|reserved_special_token_186|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128195": {
+ "content": "<|reserved_special_token_187|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128196": {
+ "content": "<|reserved_special_token_188|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128197": {
+ "content": "<|reserved_special_token_189|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128198": {
+ "content": "<|reserved_special_token_190|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128199": {
+ "content": "<|reserved_special_token_191|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128200": {
+ "content": "<|reserved_special_token_192|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128201": {
+ "content": "<|reserved_special_token_193|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128202": {
+ "content": "<|reserved_special_token_194|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128203": {
+ "content": "<|reserved_special_token_195|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128204": {
+ "content": "<|reserved_special_token_196|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128205": {
+ "content": "<|reserved_special_token_197|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128206": {
+ "content": "<|reserved_special_token_198|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128207": {
+ "content": "<|reserved_special_token_199|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128208": {
+ "content": "<|reserved_special_token_200|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128209": {
+ "content": "<|reserved_special_token_201|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128210": {
+ "content": "<|reserved_special_token_202|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128211": {
+ "content": "<|reserved_special_token_203|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128212": {
+ "content": "<|reserved_special_token_204|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128213": {
+ "content": "<|reserved_special_token_205|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128214": {
+ "content": "<|reserved_special_token_206|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128215": {
+ "content": "<|reserved_special_token_207|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128216": {
+ "content": "<|reserved_special_token_208|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128217": {
+ "content": "<|reserved_special_token_209|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128218": {
+ "content": "<|reserved_special_token_210|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128219": {
+ "content": "<|reserved_special_token_211|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128220": {
+ "content": "<|reserved_special_token_212|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128221": {
+ "content": "<|reserved_special_token_213|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128222": {
+ "content": "<|reserved_special_token_214|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128223": {
+ "content": "<|reserved_special_token_215|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128224": {
+ "content": "<|reserved_special_token_216|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128225": {
+ "content": "<|reserved_special_token_217|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128226": {
+ "content": "<|reserved_special_token_218|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128227": {
+ "content": "<|reserved_special_token_219|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128228": {
+ "content": "<|reserved_special_token_220|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128229": {
+ "content": "<|reserved_special_token_221|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128230": {
+ "content": "<|reserved_special_token_222|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128231": {
+ "content": "<|reserved_special_token_223|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128232": {
+ "content": "<|reserved_special_token_224|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128233": {
+ "content": "<|reserved_special_token_225|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128234": {
+ "content": "<|reserved_special_token_226|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128235": {
+ "content": "<|reserved_special_token_227|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128236": {
+ "content": "<|reserved_special_token_228|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128237": {
+ "content": "<|reserved_special_token_229|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128238": {
+ "content": "<|reserved_special_token_230|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128239": {
+ "content": "<|reserved_special_token_231|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128240": {
+ "content": "<|reserved_special_token_232|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128241": {
+ "content": "<|reserved_special_token_233|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128242": {
+ "content": "<|reserved_special_token_234|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128243": {
+ "content": "<|reserved_special_token_235|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128244": {
+ "content": "<|reserved_special_token_236|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128245": {
+ "content": "<|reserved_special_token_237|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128246": {
+ "content": "<|reserved_special_token_238|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128247": {
+ "content": "<|reserved_special_token_239|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128248": {
+ "content": "<|reserved_special_token_240|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128249": {
+ "content": "<|reserved_special_token_241|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128250": {
+ "content": "<|reserved_special_token_242|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128251": {
+ "content": "<|reserved_special_token_243|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128252": {
+ "content": "<|reserved_special_token_244|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128253": {
+ "content": "<|reserved_special_token_245|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128254": {
+ "content": "<|reserved_special_token_246|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128255": {
+ "content": "<|reserved_special_token_247|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<|begin_of_text|>",
+ "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n",
+ "clean_up_tokenization_spaces": true,
+ "eos_token": "<|eot_id|>",
+ "extra_special_tokens": {},
+ "model_input_names": [
+ "input_ids",
+ "attention_mask"
+ ],
+ "model_max_length": 131072,
+ "pad_token": "<|end_of_text|>",
+ "tokenizer_class": "PreTrainedTokenizer"
+}
diff --git a/checkpoint-1060/trainer_state.json b/checkpoint-1060/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..a7d033b924770b8d8c36d5c9d2918b7d4db499b7
--- /dev/null
+++ b/checkpoint-1060/trainer_state.json
@@ -0,0 +1,7453 @@
+{
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 4.996865203761756,
+ "eval_steps": 500,
+ "global_step": 1060,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.004702194357366771,
+ "grad_norm": 3.1606569290161133,
+ "learning_rate": 5.0000000000000004e-08,
+ "loss": 1.0072,
+ "step": 1
+ },
+ {
+ "epoch": 0.009404388714733543,
+ "grad_norm": 3.2058725357055664,
+ "learning_rate": 1.0000000000000001e-07,
+ "loss": 1.0134,
+ "step": 2
+ },
+ {
+ "epoch": 0.014106583072100314,
+ "grad_norm": 2.636291265487671,
+ "learning_rate": 1.5000000000000002e-07,
+ "loss": 0.9635,
+ "step": 3
+ },
+ {
+ "epoch": 0.018808777429467086,
+ "grad_norm": 2.708746910095215,
+ "learning_rate": 2.0000000000000002e-07,
+ "loss": 1.0068,
+ "step": 4
+ },
+ {
+ "epoch": 0.023510971786833857,
+ "grad_norm": 2.8948426246643066,
+ "learning_rate": 2.5000000000000004e-07,
+ "loss": 0.9608,
+ "step": 5
+ },
+ {
+ "epoch": 0.02821316614420063,
+ "grad_norm": 2.8740086555480957,
+ "learning_rate": 3.0000000000000004e-07,
+ "loss": 0.9896,
+ "step": 6
+ },
+ {
+ "epoch": 0.032915360501567396,
+ "grad_norm": 2.8338170051574707,
+ "learning_rate": 3.5000000000000004e-07,
+ "loss": 0.9098,
+ "step": 7
+ },
+ {
+ "epoch": 0.03761755485893417,
+ "grad_norm": 2.7783002853393555,
+ "learning_rate": 4.0000000000000003e-07,
+ "loss": 0.9733,
+ "step": 8
+ },
+ {
+ "epoch": 0.04231974921630094,
+ "grad_norm": 3.043574333190918,
+ "learning_rate": 4.5000000000000003e-07,
+ "loss": 0.9943,
+ "step": 9
+ },
+ {
+ "epoch": 0.047021943573667714,
+ "grad_norm": 3.142383337020874,
+ "learning_rate": 5.000000000000001e-07,
+ "loss": 0.9475,
+ "step": 10
+ },
+ {
+ "epoch": 0.05172413793103448,
+ "grad_norm": 2.9817280769348145,
+ "learning_rate": 5.5e-07,
+ "loss": 0.9701,
+ "step": 11
+ },
+ {
+ "epoch": 0.05642633228840126,
+ "grad_norm": 2.95699405670166,
+ "learning_rate": 6.000000000000001e-07,
+ "loss": 0.9983,
+ "step": 12
+ },
+ {
+ "epoch": 0.061128526645768025,
+ "grad_norm": 2.8782453536987305,
+ "learning_rate": 6.5e-07,
+ "loss": 0.9502,
+ "step": 13
+ },
+ {
+ "epoch": 0.06583072100313479,
+ "grad_norm": 2.6715071201324463,
+ "learning_rate": 7.000000000000001e-07,
+ "loss": 0.9436,
+ "step": 14
+ },
+ {
+ "epoch": 0.07053291536050156,
+ "grad_norm": 3.869649648666382,
+ "learning_rate": 7.5e-07,
+ "loss": 0.9692,
+ "step": 15
+ },
+ {
+ "epoch": 0.07523510971786834,
+ "grad_norm": 3.060220956802368,
+ "learning_rate": 8.000000000000001e-07,
+ "loss": 0.9258,
+ "step": 16
+ },
+ {
+ "epoch": 0.07993730407523511,
+ "grad_norm": 2.8922741413116455,
+ "learning_rate": 8.500000000000001e-07,
+ "loss": 0.9719,
+ "step": 17
+ },
+ {
+ "epoch": 0.08463949843260188,
+ "grad_norm": 2.7857820987701416,
+ "learning_rate": 9.000000000000001e-07,
+ "loss": 0.9072,
+ "step": 18
+ },
+ {
+ "epoch": 0.08934169278996865,
+ "grad_norm": 2.9753293991088867,
+ "learning_rate": 9.500000000000001e-07,
+ "loss": 0.9032,
+ "step": 19
+ },
+ {
+ "epoch": 0.09404388714733543,
+ "grad_norm": 2.7989683151245117,
+ "learning_rate": 1.0000000000000002e-06,
+ "loss": 0.8887,
+ "step": 20
+ },
+ {
+ "epoch": 0.0987460815047022,
+ "grad_norm": 2.3953049182891846,
+ "learning_rate": 1.0500000000000001e-06,
+ "loss": 0.8968,
+ "step": 21
+ },
+ {
+ "epoch": 0.10344827586206896,
+ "grad_norm": 2.643731117248535,
+ "learning_rate": 1.1e-06,
+ "loss": 0.8501,
+ "step": 22
+ },
+ {
+ "epoch": 0.10815047021943573,
+ "grad_norm": 2.3679006099700928,
+ "learning_rate": 1.1500000000000002e-06,
+ "loss": 0.8476,
+ "step": 23
+ },
+ {
+ "epoch": 0.11285266457680251,
+ "grad_norm": 2.5935540199279785,
+ "learning_rate": 1.2000000000000002e-06,
+ "loss": 0.8095,
+ "step": 24
+ },
+ {
+ "epoch": 0.11755485893416928,
+ "grad_norm": 2.510300636291504,
+ "learning_rate": 1.25e-06,
+ "loss": 0.8099,
+ "step": 25
+ },
+ {
+ "epoch": 0.12225705329153605,
+ "grad_norm": 2.372344970703125,
+ "learning_rate": 1.3e-06,
+ "loss": 0.7869,
+ "step": 26
+ },
+ {
+ "epoch": 0.12695924764890282,
+ "grad_norm": 2.303426504135132,
+ "learning_rate": 1.3500000000000002e-06,
+ "loss": 0.7758,
+ "step": 27
+ },
+ {
+ "epoch": 0.13166144200626959,
+ "grad_norm": 1.9017939567565918,
+ "learning_rate": 1.4000000000000001e-06,
+ "loss": 0.7498,
+ "step": 28
+ },
+ {
+ "epoch": 0.13636363636363635,
+ "grad_norm": 1.8810580968856812,
+ "learning_rate": 1.45e-06,
+ "loss": 0.7878,
+ "step": 29
+ },
+ {
+ "epoch": 0.14106583072100312,
+ "grad_norm": 1.7797424793243408,
+ "learning_rate": 1.5e-06,
+ "loss": 0.7747,
+ "step": 30
+ },
+ {
+ "epoch": 0.14576802507836992,
+ "grad_norm": 1.5053879022598267,
+ "learning_rate": 1.5500000000000002e-06,
+ "loss": 0.7735,
+ "step": 31
+ },
+ {
+ "epoch": 0.15047021943573669,
+ "grad_norm": 1.4909234046936035,
+ "learning_rate": 1.6000000000000001e-06,
+ "loss": 0.7654,
+ "step": 32
+ },
+ {
+ "epoch": 0.15517241379310345,
+ "grad_norm": 1.36083984375,
+ "learning_rate": 1.6500000000000003e-06,
+ "loss": 0.6895,
+ "step": 33
+ },
+ {
+ "epoch": 0.15987460815047022,
+ "grad_norm": 1.536014199256897,
+ "learning_rate": 1.7000000000000002e-06,
+ "loss": 0.675,
+ "step": 34
+ },
+ {
+ "epoch": 0.164576802507837,
+ "grad_norm": 1.3426779508590698,
+ "learning_rate": 1.75e-06,
+ "loss": 0.7652,
+ "step": 35
+ },
+ {
+ "epoch": 0.16927899686520376,
+ "grad_norm": 1.4900612831115723,
+ "learning_rate": 1.8000000000000001e-06,
+ "loss": 0.6863,
+ "step": 36
+ },
+ {
+ "epoch": 0.17398119122257052,
+ "grad_norm": 1.181241750717163,
+ "learning_rate": 1.85e-06,
+ "loss": 0.7136,
+ "step": 37
+ },
+ {
+ "epoch": 0.1786833855799373,
+ "grad_norm": 1.461419701576233,
+ "learning_rate": 1.9000000000000002e-06,
+ "loss": 0.7606,
+ "step": 38
+ },
+ {
+ "epoch": 0.1833855799373041,
+ "grad_norm": 1.04817795753479,
+ "learning_rate": 1.9500000000000004e-06,
+ "loss": 0.6829,
+ "step": 39
+ },
+ {
+ "epoch": 0.18808777429467086,
+ "grad_norm": 1.0499993562698364,
+ "learning_rate": 2.0000000000000003e-06,
+ "loss": 0.7144,
+ "step": 40
+ },
+ {
+ "epoch": 0.19278996865203762,
+ "grad_norm": 0.9935064315795898,
+ "learning_rate": 2.05e-06,
+ "loss": 0.6736,
+ "step": 41
+ },
+ {
+ "epoch": 0.1974921630094044,
+ "grad_norm": 0.9919099807739258,
+ "learning_rate": 2.1000000000000002e-06,
+ "loss": 0.7151,
+ "step": 42
+ },
+ {
+ "epoch": 0.20219435736677116,
+ "grad_norm": 0.919556200504303,
+ "learning_rate": 2.15e-06,
+ "loss": 0.6847,
+ "step": 43
+ },
+ {
+ "epoch": 0.20689655172413793,
+ "grad_norm": 1.4762015342712402,
+ "learning_rate": 2.2e-06,
+ "loss": 0.6694,
+ "step": 44
+ },
+ {
+ "epoch": 0.2115987460815047,
+ "grad_norm": 0.9243163466453552,
+ "learning_rate": 2.25e-06,
+ "loss": 0.6489,
+ "step": 45
+ },
+ {
+ "epoch": 0.21630094043887146,
+ "grad_norm": 0.7614469528198242,
+ "learning_rate": 2.3000000000000004e-06,
+ "loss": 0.6568,
+ "step": 46
+ },
+ {
+ "epoch": 0.22100313479623823,
+ "grad_norm": 0.7543922662734985,
+ "learning_rate": 2.35e-06,
+ "loss": 0.6359,
+ "step": 47
+ },
+ {
+ "epoch": 0.22570532915360503,
+ "grad_norm": 0.7558912038803101,
+ "learning_rate": 2.4000000000000003e-06,
+ "loss": 0.6231,
+ "step": 48
+ },
+ {
+ "epoch": 0.2304075235109718,
+ "grad_norm": 0.7822129130363464,
+ "learning_rate": 2.4500000000000003e-06,
+ "loss": 0.6691,
+ "step": 49
+ },
+ {
+ "epoch": 0.23510971786833856,
+ "grad_norm": 0.8646999597549438,
+ "learning_rate": 2.5e-06,
+ "loss": 0.682,
+ "step": 50
+ },
+ {
+ "epoch": 0.23981191222570533,
+ "grad_norm": 0.8824774622917175,
+ "learning_rate": 2.55e-06,
+ "loss": 0.6805,
+ "step": 51
+ },
+ {
+ "epoch": 0.2445141065830721,
+ "grad_norm": 0.7697399258613586,
+ "learning_rate": 2.6e-06,
+ "loss": 0.6368,
+ "step": 52
+ },
+ {
+ "epoch": 0.24921630094043887,
+ "grad_norm": 0.6522512435913086,
+ "learning_rate": 2.6500000000000005e-06,
+ "loss": 0.6367,
+ "step": 53
+ },
+ {
+ "epoch": 0.25391849529780564,
+ "grad_norm": 0.6172305941581726,
+ "learning_rate": 2.7000000000000004e-06,
+ "loss": 0.6291,
+ "step": 54
+ },
+ {
+ "epoch": 0.25862068965517243,
+ "grad_norm": 0.7860460877418518,
+ "learning_rate": 2.7500000000000004e-06,
+ "loss": 0.6736,
+ "step": 55
+ },
+ {
+ "epoch": 0.26332288401253917,
+ "grad_norm": 0.6474862694740295,
+ "learning_rate": 2.8000000000000003e-06,
+ "loss": 0.6365,
+ "step": 56
+ },
+ {
+ "epoch": 0.26802507836990597,
+ "grad_norm": 0.6867114901542664,
+ "learning_rate": 2.85e-06,
+ "loss": 0.6397,
+ "step": 57
+ },
+ {
+ "epoch": 0.2727272727272727,
+ "grad_norm": 0.7056852579116821,
+ "learning_rate": 2.9e-06,
+ "loss": 0.6138,
+ "step": 58
+ },
+ {
+ "epoch": 0.2774294670846395,
+ "grad_norm": 0.6615664958953857,
+ "learning_rate": 2.95e-06,
+ "loss": 0.6482,
+ "step": 59
+ },
+ {
+ "epoch": 0.28213166144200624,
+ "grad_norm": 0.6649022102355957,
+ "learning_rate": 3e-06,
+ "loss": 0.6745,
+ "step": 60
+ },
+ {
+ "epoch": 0.28683385579937304,
+ "grad_norm": 0.850848913192749,
+ "learning_rate": 3.05e-06,
+ "loss": 0.5956,
+ "step": 61
+ },
+ {
+ "epoch": 0.29153605015673983,
+ "grad_norm": 0.5983562469482422,
+ "learning_rate": 3.1000000000000004e-06,
+ "loss": 0.5894,
+ "step": 62
+ },
+ {
+ "epoch": 0.2962382445141066,
+ "grad_norm": 0.6286782622337341,
+ "learning_rate": 3.1500000000000003e-06,
+ "loss": 0.6329,
+ "step": 63
+ },
+ {
+ "epoch": 0.30094043887147337,
+ "grad_norm": 0.5919945240020752,
+ "learning_rate": 3.2000000000000003e-06,
+ "loss": 0.6402,
+ "step": 64
+ },
+ {
+ "epoch": 0.3056426332288401,
+ "grad_norm": 0.5632765889167786,
+ "learning_rate": 3.2500000000000002e-06,
+ "loss": 0.5862,
+ "step": 65
+ },
+ {
+ "epoch": 0.3103448275862069,
+ "grad_norm": 0.7692590951919556,
+ "learning_rate": 3.3000000000000006e-06,
+ "loss": 0.6031,
+ "step": 66
+ },
+ {
+ "epoch": 0.31504702194357365,
+ "grad_norm": 0.7313893437385559,
+ "learning_rate": 3.3500000000000005e-06,
+ "loss": 0.6312,
+ "step": 67
+ },
+ {
+ "epoch": 0.31974921630094044,
+ "grad_norm": 0.6097120642662048,
+ "learning_rate": 3.4000000000000005e-06,
+ "loss": 0.5986,
+ "step": 68
+ },
+ {
+ "epoch": 0.32445141065830724,
+ "grad_norm": 0.5853808522224426,
+ "learning_rate": 3.45e-06,
+ "loss": 0.5847,
+ "step": 69
+ },
+ {
+ "epoch": 0.329153605015674,
+ "grad_norm": 0.6093555092811584,
+ "learning_rate": 3.5e-06,
+ "loss": 0.6552,
+ "step": 70
+ },
+ {
+ "epoch": 0.3338557993730408,
+ "grad_norm": 0.6106334328651428,
+ "learning_rate": 3.5500000000000003e-06,
+ "loss": 0.6196,
+ "step": 71
+ },
+ {
+ "epoch": 0.3385579937304075,
+ "grad_norm": 0.9254828691482544,
+ "learning_rate": 3.6000000000000003e-06,
+ "loss": 0.6005,
+ "step": 72
+ },
+ {
+ "epoch": 0.3432601880877743,
+ "grad_norm": 0.5471694469451904,
+ "learning_rate": 3.65e-06,
+ "loss": 0.5907,
+ "step": 73
+ },
+ {
+ "epoch": 0.34796238244514105,
+ "grad_norm": 0.6204228401184082,
+ "learning_rate": 3.7e-06,
+ "loss": 0.6079,
+ "step": 74
+ },
+ {
+ "epoch": 0.35266457680250785,
+ "grad_norm": 0.52458256483078,
+ "learning_rate": 3.7500000000000005e-06,
+ "loss": 0.6001,
+ "step": 75
+ },
+ {
+ "epoch": 0.3573667711598746,
+ "grad_norm": 0.5356763601303101,
+ "learning_rate": 3.8000000000000005e-06,
+ "loss": 0.5987,
+ "step": 76
+ },
+ {
+ "epoch": 0.3620689655172414,
+ "grad_norm": 0.5408467054367065,
+ "learning_rate": 3.85e-06,
+ "loss": 0.6104,
+ "step": 77
+ },
+ {
+ "epoch": 0.3667711598746082,
+ "grad_norm": 0.5075871348381042,
+ "learning_rate": 3.900000000000001e-06,
+ "loss": 0.5569,
+ "step": 78
+ },
+ {
+ "epoch": 0.3714733542319749,
+ "grad_norm": 0.8474109768867493,
+ "learning_rate": 3.95e-06,
+ "loss": 0.6195,
+ "step": 79
+ },
+ {
+ "epoch": 0.3761755485893417,
+ "grad_norm": 0.4750897288322449,
+ "learning_rate": 4.000000000000001e-06,
+ "loss": 0.5399,
+ "step": 80
+ },
+ {
+ "epoch": 0.38087774294670845,
+ "grad_norm": 0.5082002878189087,
+ "learning_rate": 4.05e-06,
+ "loss": 0.5997,
+ "step": 81
+ },
+ {
+ "epoch": 0.38557993730407525,
+ "grad_norm": 0.5343796014785767,
+ "learning_rate": 4.1e-06,
+ "loss": 0.5704,
+ "step": 82
+ },
+ {
+ "epoch": 0.390282131661442,
+ "grad_norm": 0.520311713218689,
+ "learning_rate": 4.15e-06,
+ "loss": 0.5818,
+ "step": 83
+ },
+ {
+ "epoch": 0.3949843260188088,
+ "grad_norm": 0.5292978286743164,
+ "learning_rate": 4.2000000000000004e-06,
+ "loss": 0.5852,
+ "step": 84
+ },
+ {
+ "epoch": 0.3996865203761755,
+ "grad_norm": 0.539886474609375,
+ "learning_rate": 4.25e-06,
+ "loss": 0.6057,
+ "step": 85
+ },
+ {
+ "epoch": 0.4043887147335423,
+ "grad_norm": 0.6468827128410339,
+ "learning_rate": 4.3e-06,
+ "loss": 0.6122,
+ "step": 86
+ },
+ {
+ "epoch": 0.4090909090909091,
+ "grad_norm": 0.5537365078926086,
+ "learning_rate": 4.350000000000001e-06,
+ "loss": 0.5652,
+ "step": 87
+ },
+ {
+ "epoch": 0.41379310344827586,
+ "grad_norm": 0.6226018667221069,
+ "learning_rate": 4.4e-06,
+ "loss": 0.5884,
+ "step": 88
+ },
+ {
+ "epoch": 0.41849529780564265,
+ "grad_norm": 0.5016945004463196,
+ "learning_rate": 4.450000000000001e-06,
+ "loss": 0.5877,
+ "step": 89
+ },
+ {
+ "epoch": 0.4231974921630094,
+ "grad_norm": 0.5059167146682739,
+ "learning_rate": 4.5e-06,
+ "loss": 0.5676,
+ "step": 90
+ },
+ {
+ "epoch": 0.4278996865203762,
+ "grad_norm": 0.47521743178367615,
+ "learning_rate": 4.5500000000000005e-06,
+ "loss": 0.5929,
+ "step": 91
+ },
+ {
+ "epoch": 0.43260188087774293,
+ "grad_norm": 0.531306266784668,
+ "learning_rate": 4.600000000000001e-06,
+ "loss": 0.5983,
+ "step": 92
+ },
+ {
+ "epoch": 0.4373040752351097,
+ "grad_norm": 0.4965567886829376,
+ "learning_rate": 4.65e-06,
+ "loss": 0.5279,
+ "step": 93
+ },
+ {
+ "epoch": 0.44200626959247646,
+ "grad_norm": 0.5125988125801086,
+ "learning_rate": 4.7e-06,
+ "loss": 0.5436,
+ "step": 94
+ },
+ {
+ "epoch": 0.44670846394984326,
+ "grad_norm": 0.557763934135437,
+ "learning_rate": 4.75e-06,
+ "loss": 0.5496,
+ "step": 95
+ },
+ {
+ "epoch": 0.45141065830721006,
+ "grad_norm": 0.6993274092674255,
+ "learning_rate": 4.800000000000001e-06,
+ "loss": 0.5498,
+ "step": 96
+ },
+ {
+ "epoch": 0.4561128526645768,
+ "grad_norm": 0.5485453009605408,
+ "learning_rate": 4.85e-06,
+ "loss": 0.5552,
+ "step": 97
+ },
+ {
+ "epoch": 0.4608150470219436,
+ "grad_norm": 1.9821522235870361,
+ "learning_rate": 4.9000000000000005e-06,
+ "loss": 0.569,
+ "step": 98
+ },
+ {
+ "epoch": 0.46551724137931033,
+ "grad_norm": 0.6074144840240479,
+ "learning_rate": 4.95e-06,
+ "loss": 0.5546,
+ "step": 99
+ },
+ {
+ "epoch": 0.4702194357366771,
+ "grad_norm": 0.5404040813446045,
+ "learning_rate": 5e-06,
+ "loss": 0.5775,
+ "step": 100
+ },
+ {
+ "epoch": 0.47492163009404387,
+ "grad_norm": 0.500438928604126,
+ "learning_rate": 4.9999910183883085e-06,
+ "loss": 0.5569,
+ "step": 101
+ },
+ {
+ "epoch": 0.47962382445141066,
+ "grad_norm": 0.5036981701850891,
+ "learning_rate": 4.999964073617768e-06,
+ "loss": 0.5663,
+ "step": 102
+ },
+ {
+ "epoch": 0.4843260188087774,
+ "grad_norm": 0.4537642300128937,
+ "learning_rate": 4.999919165881985e-06,
+ "loss": 0.5527,
+ "step": 103
+ },
+ {
+ "epoch": 0.4890282131661442,
+ "grad_norm": 0.49653521180152893,
+ "learning_rate": 4.999856295503635e-06,
+ "loss": 0.563,
+ "step": 104
+ },
+ {
+ "epoch": 0.493730407523511,
+ "grad_norm": 0.46847566962242126,
+ "learning_rate": 4.9997754629344596e-06,
+ "loss": 0.5425,
+ "step": 105
+ },
+ {
+ "epoch": 0.49843260188087773,
+ "grad_norm": 0.5192411541938782,
+ "learning_rate": 4.999676668755263e-06,
+ "loss": 0.5315,
+ "step": 106
+ },
+ {
+ "epoch": 0.5031347962382445,
+ "grad_norm": 0.5170287489891052,
+ "learning_rate": 4.999559913675912e-06,
+ "loss": 0.5627,
+ "step": 107
+ },
+ {
+ "epoch": 0.5078369905956113,
+ "grad_norm": 0.47297438979148865,
+ "learning_rate": 4.999425198535325e-06,
+ "loss": 0.5432,
+ "step": 108
+ },
+ {
+ "epoch": 0.512539184952978,
+ "grad_norm": 0.4873776137828827,
+ "learning_rate": 4.999272524301469e-06,
+ "loss": 0.5473,
+ "step": 109
+ },
+ {
+ "epoch": 0.5172413793103449,
+ "grad_norm": 0.5432935357093811,
+ "learning_rate": 4.9991018920713505e-06,
+ "loss": 0.5642,
+ "step": 110
+ },
+ {
+ "epoch": 0.5219435736677116,
+ "grad_norm": 0.4850105345249176,
+ "learning_rate": 4.9989133030710154e-06,
+ "loss": 0.548,
+ "step": 111
+ },
+ {
+ "epoch": 0.5266457680250783,
+ "grad_norm": 0.9399585723876953,
+ "learning_rate": 4.9987067586555275e-06,
+ "loss": 0.5471,
+ "step": 112
+ },
+ {
+ "epoch": 0.5313479623824452,
+ "grad_norm": 0.5167811512947083,
+ "learning_rate": 4.998482260308969e-06,
+ "loss": 0.5648,
+ "step": 113
+ },
+ {
+ "epoch": 0.5360501567398119,
+ "grad_norm": 0.5069029927253723,
+ "learning_rate": 4.998239809644427e-06,
+ "loss": 0.5568,
+ "step": 114
+ },
+ {
+ "epoch": 0.5407523510971787,
+ "grad_norm": 0.8738563656806946,
+ "learning_rate": 4.9979794084039755e-06,
+ "loss": 0.5719,
+ "step": 115
+ },
+ {
+ "epoch": 0.5454545454545454,
+ "grad_norm": 0.5216553807258606,
+ "learning_rate": 4.997701058458677e-06,
+ "loss": 0.5309,
+ "step": 116
+ },
+ {
+ "epoch": 0.5501567398119123,
+ "grad_norm": 0.9678344130516052,
+ "learning_rate": 4.997404761808554e-06,
+ "loss": 0.5645,
+ "step": 117
+ },
+ {
+ "epoch": 0.554858934169279,
+ "grad_norm": 0.496598482131958,
+ "learning_rate": 4.9970905205825845e-06,
+ "loss": 0.5711,
+ "step": 118
+ },
+ {
+ "epoch": 0.5595611285266457,
+ "grad_norm": 0.4745199680328369,
+ "learning_rate": 4.996758337038683e-06,
+ "loss": 0.5613,
+ "step": 119
+ },
+ {
+ "epoch": 0.5642633228840125,
+ "grad_norm": 0.5595977902412415,
+ "learning_rate": 4.996408213563684e-06,
+ "loss": 0.5559,
+ "step": 120
+ },
+ {
+ "epoch": 0.5689655172413793,
+ "grad_norm": 0.4743712544441223,
+ "learning_rate": 4.996040152673326e-06,
+ "loss": 0.5228,
+ "step": 121
+ },
+ {
+ "epoch": 0.5736677115987461,
+ "grad_norm": 0.5418100953102112,
+ "learning_rate": 4.995654157012233e-06,
+ "loss": 0.536,
+ "step": 122
+ },
+ {
+ "epoch": 0.5783699059561128,
+ "grad_norm": 0.521977424621582,
+ "learning_rate": 4.995250229353895e-06,
+ "loss": 0.5305,
+ "step": 123
+ },
+ {
+ "epoch": 0.5830721003134797,
+ "grad_norm": 0.5062761902809143,
+ "learning_rate": 4.99482837260065e-06,
+ "loss": 0.5417,
+ "step": 124
+ },
+ {
+ "epoch": 0.5877742946708464,
+ "grad_norm": 0.5895913243293762,
+ "learning_rate": 4.99438858978366e-06,
+ "loss": 0.573,
+ "step": 125
+ },
+ {
+ "epoch": 0.5924764890282131,
+ "grad_norm": 0.5442466139793396,
+ "learning_rate": 4.993930884062892e-06,
+ "loss": 0.5563,
+ "step": 126
+ },
+ {
+ "epoch": 0.5971786833855799,
+ "grad_norm": 0.5130571722984314,
+ "learning_rate": 4.993455258727094e-06,
+ "loss": 0.5549,
+ "step": 127
+ },
+ {
+ "epoch": 0.6018808777429467,
+ "grad_norm": 0.5579081773757935,
+ "learning_rate": 4.992961717193773e-06,
+ "loss": 0.5554,
+ "step": 128
+ },
+ {
+ "epoch": 0.6065830721003135,
+ "grad_norm": 0.6375890374183655,
+ "learning_rate": 4.9924502630091655e-06,
+ "loss": 0.5626,
+ "step": 129
+ },
+ {
+ "epoch": 0.6112852664576802,
+ "grad_norm": 0.5129190683364868,
+ "learning_rate": 4.99192089984822e-06,
+ "loss": 0.5493,
+ "step": 130
+ },
+ {
+ "epoch": 0.6159874608150471,
+ "grad_norm": 0.5293419361114502,
+ "learning_rate": 4.9913736315145614e-06,
+ "loss": 0.5565,
+ "step": 131
+ },
+ {
+ "epoch": 0.6206896551724138,
+ "grad_norm": 0.6502572298049927,
+ "learning_rate": 4.990808461940474e-06,
+ "loss": 0.5358,
+ "step": 132
+ },
+ {
+ "epoch": 0.6253918495297806,
+ "grad_norm": 0.5450296998023987,
+ "learning_rate": 4.990225395186862e-06,
+ "loss": 0.5421,
+ "step": 133
+ },
+ {
+ "epoch": 0.6300940438871473,
+ "grad_norm": 0.45506399869918823,
+ "learning_rate": 4.9896244354432314e-06,
+ "loss": 0.5396,
+ "step": 134
+ },
+ {
+ "epoch": 0.6347962382445141,
+ "grad_norm": 0.5095545649528503,
+ "learning_rate": 4.98900558702765e-06,
+ "loss": 0.5486,
+ "step": 135
+ },
+ {
+ "epoch": 0.6394984326018809,
+ "grad_norm": 0.4836446940898895,
+ "learning_rate": 4.9883688543867225e-06,
+ "loss": 0.5596,
+ "step": 136
+ },
+ {
+ "epoch": 0.6442006269592476,
+ "grad_norm": 0.5253512859344482,
+ "learning_rate": 4.987714242095558e-06,
+ "loss": 0.5308,
+ "step": 137
+ },
+ {
+ "epoch": 0.6489028213166145,
+ "grad_norm": 0.8280164003372192,
+ "learning_rate": 4.9870417548577355e-06,
+ "loss": 0.5349,
+ "step": 138
+ },
+ {
+ "epoch": 0.6536050156739812,
+ "grad_norm": 0.4729730188846588,
+ "learning_rate": 4.9863513975052696e-06,
+ "loss": 0.5416,
+ "step": 139
+ },
+ {
+ "epoch": 0.658307210031348,
+ "grad_norm": 0.5932718515396118,
+ "learning_rate": 4.985643174998578e-06,
+ "loss": 0.5638,
+ "step": 140
+ },
+ {
+ "epoch": 0.6630094043887147,
+ "grad_norm": 0.5187026262283325,
+ "learning_rate": 4.984917092426445e-06,
+ "loss": 0.5507,
+ "step": 141
+ },
+ {
+ "epoch": 0.6677115987460815,
+ "grad_norm": 0.5024245977401733,
+ "learning_rate": 4.984173155005982e-06,
+ "loss": 0.5406,
+ "step": 142
+ },
+ {
+ "epoch": 0.6724137931034483,
+ "grad_norm": 0.4735509157180786,
+ "learning_rate": 4.983411368082597e-06,
+ "loss": 0.5431,
+ "step": 143
+ },
+ {
+ "epoch": 0.677115987460815,
+ "grad_norm": 0.5040024518966675,
+ "learning_rate": 4.982631737129948e-06,
+ "loss": 0.5291,
+ "step": 144
+ },
+ {
+ "epoch": 0.6818181818181818,
+ "grad_norm": 0.47764894366264343,
+ "learning_rate": 4.98183426774991e-06,
+ "loss": 0.5677,
+ "step": 145
+ },
+ {
+ "epoch": 0.6865203761755486,
+ "grad_norm": 0.5211489796638489,
+ "learning_rate": 4.981018965672529e-06,
+ "loss": 0.566,
+ "step": 146
+ },
+ {
+ "epoch": 0.6912225705329154,
+ "grad_norm": 1.022007942199707,
+ "learning_rate": 4.98018583675599e-06,
+ "loss": 0.5476,
+ "step": 147
+ },
+ {
+ "epoch": 0.6959247648902821,
+ "grad_norm": 0.5263912677764893,
+ "learning_rate": 4.979334886986562e-06,
+ "loss": 0.5473,
+ "step": 148
+ },
+ {
+ "epoch": 0.700626959247649,
+ "grad_norm": 0.5014091730117798,
+ "learning_rate": 4.978466122478567e-06,
+ "loss": 0.5642,
+ "step": 149
+ },
+ {
+ "epoch": 0.7053291536050157,
+ "grad_norm": 0.5003350973129272,
+ "learning_rate": 4.97757954947433e-06,
+ "loss": 0.5311,
+ "step": 150
+ },
+ {
+ "epoch": 0.7100313479623824,
+ "grad_norm": 0.5010690093040466,
+ "learning_rate": 4.976675174344132e-06,
+ "loss": 0.5469,
+ "step": 151
+ },
+ {
+ "epoch": 0.7147335423197492,
+ "grad_norm": 0.45779237151145935,
+ "learning_rate": 4.975753003586172e-06,
+ "loss": 0.5273,
+ "step": 152
+ },
+ {
+ "epoch": 0.719435736677116,
+ "grad_norm": 0.6231539845466614,
+ "learning_rate": 4.974813043826513e-06,
+ "loss": 0.5182,
+ "step": 153
+ },
+ {
+ "epoch": 0.7241379310344828,
+ "grad_norm": 0.5361394286155701,
+ "learning_rate": 4.973855301819039e-06,
+ "loss": 0.5372,
+ "step": 154
+ },
+ {
+ "epoch": 0.7288401253918495,
+ "grad_norm": 0.5193538665771484,
+ "learning_rate": 4.972879784445402e-06,
+ "loss": 0.5201,
+ "step": 155
+ },
+ {
+ "epoch": 0.7335423197492164,
+ "grad_norm": 0.47956809401512146,
+ "learning_rate": 4.971886498714978e-06,
+ "loss": 0.5402,
+ "step": 156
+ },
+ {
+ "epoch": 0.7382445141065831,
+ "grad_norm": 0.5303016901016235,
+ "learning_rate": 4.97087545176481e-06,
+ "loss": 0.5174,
+ "step": 157
+ },
+ {
+ "epoch": 0.7429467084639498,
+ "grad_norm": 0.5002286434173584,
+ "learning_rate": 4.9698466508595655e-06,
+ "loss": 0.5453,
+ "step": 158
+ },
+ {
+ "epoch": 0.7476489028213166,
+ "grad_norm": 0.6070297360420227,
+ "learning_rate": 4.9688001033914756e-06,
+ "loss": 0.5327,
+ "step": 159
+ },
+ {
+ "epoch": 0.7523510971786834,
+ "grad_norm": 0.5436793565750122,
+ "learning_rate": 4.967735816880286e-06,
+ "loss": 0.544,
+ "step": 160
+ },
+ {
+ "epoch": 0.7570532915360502,
+ "grad_norm": 0.538012683391571,
+ "learning_rate": 4.966653798973205e-06,
+ "loss": 0.5233,
+ "step": 161
+ },
+ {
+ "epoch": 0.7617554858934169,
+ "grad_norm": 0.4916169345378876,
+ "learning_rate": 4.965554057444842e-06,
+ "loss": 0.5168,
+ "step": 162
+ },
+ {
+ "epoch": 0.7664576802507836,
+ "grad_norm": 0.48281437158584595,
+ "learning_rate": 4.964436600197161e-06,
+ "loss": 0.5393,
+ "step": 163
+ },
+ {
+ "epoch": 0.7711598746081505,
+ "grad_norm": 0.5184990167617798,
+ "learning_rate": 4.963301435259413e-06,
+ "loss": 0.5085,
+ "step": 164
+ },
+ {
+ "epoch": 0.7758620689655172,
+ "grad_norm": 0.4706438183784485,
+ "learning_rate": 4.962148570788088e-06,
+ "loss": 0.5299,
+ "step": 165
+ },
+ {
+ "epoch": 0.780564263322884,
+ "grad_norm": 0.6550764441490173,
+ "learning_rate": 4.96097801506685e-06,
+ "loss": 0.5192,
+ "step": 166
+ },
+ {
+ "epoch": 0.7852664576802508,
+ "grad_norm": 0.5386581420898438,
+ "learning_rate": 4.959789776506482e-06,
+ "loss": 0.5258,
+ "step": 167
+ },
+ {
+ "epoch": 0.7899686520376176,
+ "grad_norm": 0.5060779452323914,
+ "learning_rate": 4.958583863644821e-06,
+ "loss": 0.5512,
+ "step": 168
+ },
+ {
+ "epoch": 0.7946708463949843,
+ "grad_norm": 0.47050032019615173,
+ "learning_rate": 4.9573602851466985e-06,
+ "loss": 0.5176,
+ "step": 169
+ },
+ {
+ "epoch": 0.799373040752351,
+ "grad_norm": 7.3139567375183105,
+ "learning_rate": 4.9561190498038815e-06,
+ "loss": 0.5381,
+ "step": 170
+ },
+ {
+ "epoch": 0.8040752351097179,
+ "grad_norm": 0.620528519153595,
+ "learning_rate": 4.954860166535005e-06,
+ "loss": 0.5299,
+ "step": 171
+ },
+ {
+ "epoch": 0.8087774294670846,
+ "grad_norm": 0.45067766308784485,
+ "learning_rate": 4.95358364438551e-06,
+ "loss": 0.5328,
+ "step": 172
+ },
+ {
+ "epoch": 0.8134796238244514,
+ "grad_norm": 0.6771508455276489,
+ "learning_rate": 4.952289492527576e-06,
+ "loss": 0.5601,
+ "step": 173
+ },
+ {
+ "epoch": 0.8181818181818182,
+ "grad_norm": 0.518925130367279,
+ "learning_rate": 4.9509777202600605e-06,
+ "loss": 0.494,
+ "step": 174
+ },
+ {
+ "epoch": 0.822884012539185,
+ "grad_norm": 0.5191988945007324,
+ "learning_rate": 4.949648337008425e-06,
+ "loss": 0.5425,
+ "step": 175
+ },
+ {
+ "epoch": 0.8275862068965517,
+ "grad_norm": 0.8600963354110718,
+ "learning_rate": 4.948301352324674e-06,
+ "loss": 0.5332,
+ "step": 176
+ },
+ {
+ "epoch": 0.8322884012539185,
+ "grad_norm": 0.5405915379524231,
+ "learning_rate": 4.946936775887281e-06,
+ "loss": 0.5276,
+ "step": 177
+ },
+ {
+ "epoch": 0.8369905956112853,
+ "grad_norm": 0.48730772733688354,
+ "learning_rate": 4.945554617501124e-06,
+ "loss": 0.5217,
+ "step": 178
+ },
+ {
+ "epoch": 0.841692789968652,
+ "grad_norm": 0.5092865824699402,
+ "learning_rate": 4.944154887097411e-06,
+ "loss": 0.5534,
+ "step": 179
+ },
+ {
+ "epoch": 0.8463949843260188,
+ "grad_norm": 0.4994933605194092,
+ "learning_rate": 4.942737594733608e-06,
+ "loss": 0.5242,
+ "step": 180
+ },
+ {
+ "epoch": 0.8510971786833855,
+ "grad_norm": 0.4554043412208557,
+ "learning_rate": 4.941302750593373e-06,
+ "loss": 0.5424,
+ "step": 181
+ },
+ {
+ "epoch": 0.8557993730407524,
+ "grad_norm": 0.4865265488624573,
+ "learning_rate": 4.939850364986475e-06,
+ "loss": 0.482,
+ "step": 182
+ },
+ {
+ "epoch": 0.8605015673981191,
+ "grad_norm": 0.5013875365257263,
+ "learning_rate": 4.938380448348725e-06,
+ "loss": 0.4908,
+ "step": 183
+ },
+ {
+ "epoch": 0.8652037617554859,
+ "grad_norm": 0.4997917115688324,
+ "learning_rate": 4.9368930112419e-06,
+ "loss": 0.5336,
+ "step": 184
+ },
+ {
+ "epoch": 0.8699059561128527,
+ "grad_norm": 0.4783482551574707,
+ "learning_rate": 4.935388064353665e-06,
+ "loss": 0.5338,
+ "step": 185
+ },
+ {
+ "epoch": 0.8746081504702194,
+ "grad_norm": 0.7221089005470276,
+ "learning_rate": 4.9338656184975e-06,
+ "loss": 0.5327,
+ "step": 186
+ },
+ {
+ "epoch": 0.8793103448275862,
+ "grad_norm": 0.48115843534469604,
+ "learning_rate": 4.932325684612618e-06,
+ "loss": 0.5408,
+ "step": 187
+ },
+ {
+ "epoch": 0.8840125391849529,
+ "grad_norm": 0.4940219223499298,
+ "learning_rate": 4.93076827376389e-06,
+ "loss": 0.5455,
+ "step": 188
+ },
+ {
+ "epoch": 0.8887147335423198,
+ "grad_norm": 0.4754747450351715,
+ "learning_rate": 4.9291933971417635e-06,
+ "loss": 0.542,
+ "step": 189
+ },
+ {
+ "epoch": 0.8934169278996865,
+ "grad_norm": 0.548713207244873,
+ "learning_rate": 4.9276010660621835e-06,
+ "loss": 0.5292,
+ "step": 190
+ },
+ {
+ "epoch": 0.8981191222570533,
+ "grad_norm": 0.7292612195014954,
+ "learning_rate": 4.925991291966508e-06,
+ "loss": 0.5073,
+ "step": 191
+ },
+ {
+ "epoch": 0.9028213166144201,
+ "grad_norm": 0.5254770517349243,
+ "learning_rate": 4.92436408642143e-06,
+ "loss": 0.5451,
+ "step": 192
+ },
+ {
+ "epoch": 0.9075235109717869,
+ "grad_norm": 0.47938767075538635,
+ "learning_rate": 4.9227194611188934e-06,
+ "loss": 0.5204,
+ "step": 193
+ },
+ {
+ "epoch": 0.9122257053291536,
+ "grad_norm": 0.6740232706069946,
+ "learning_rate": 4.921057427876007e-06,
+ "loss": 0.4928,
+ "step": 194
+ },
+ {
+ "epoch": 0.9169278996865203,
+ "grad_norm": 0.5455343723297119,
+ "learning_rate": 4.919377998634959e-06,
+ "loss": 0.5468,
+ "step": 195
+ },
+ {
+ "epoch": 0.9216300940438872,
+ "grad_norm": 0.5001958012580872,
+ "learning_rate": 4.917681185462934e-06,
+ "loss": 0.5339,
+ "step": 196
+ },
+ {
+ "epoch": 0.9263322884012539,
+ "grad_norm": 0.5084257125854492,
+ "learning_rate": 4.915967000552028e-06,
+ "loss": 0.5259,
+ "step": 197
+ },
+ {
+ "epoch": 0.9310344827586207,
+ "grad_norm": 0.4807164967060089,
+ "learning_rate": 4.914235456219154e-06,
+ "loss": 0.5204,
+ "step": 198
+ },
+ {
+ "epoch": 0.9357366771159875,
+ "grad_norm": 0.6099370718002319,
+ "learning_rate": 4.912486564905959e-06,
+ "loss": 0.544,
+ "step": 199
+ },
+ {
+ "epoch": 0.9404388714733543,
+ "grad_norm": 0.47461947798728943,
+ "learning_rate": 4.910720339178735e-06,
+ "loss": 0.5295,
+ "step": 200
+ },
+ {
+ "epoch": 0.945141065830721,
+ "grad_norm": 0.500136137008667,
+ "learning_rate": 4.908936791728323e-06,
+ "loss": 0.5321,
+ "step": 201
+ },
+ {
+ "epoch": 0.9498432601880877,
+ "grad_norm": 0.5235631465911865,
+ "learning_rate": 4.907135935370027e-06,
+ "loss": 0.5338,
+ "step": 202
+ },
+ {
+ "epoch": 0.9545454545454546,
+ "grad_norm": 0.9285804629325867,
+ "learning_rate": 4.905317783043523e-06,
+ "loss": 0.5393,
+ "step": 203
+ },
+ {
+ "epoch": 0.9592476489028213,
+ "grad_norm": 0.4834178388118744,
+ "learning_rate": 4.9034823478127605e-06,
+ "loss": 0.5211,
+ "step": 204
+ },
+ {
+ "epoch": 0.9639498432601881,
+ "grad_norm": 0.4830580949783325,
+ "learning_rate": 4.901629642865872e-06,
+ "loss": 0.4986,
+ "step": 205
+ },
+ {
+ "epoch": 0.9686520376175548,
+ "grad_norm": 0.49718615412712097,
+ "learning_rate": 4.89975968151508e-06,
+ "loss": 0.5204,
+ "step": 206
+ },
+ {
+ "epoch": 0.9733542319749217,
+ "grad_norm": 0.5056726336479187,
+ "learning_rate": 4.8978724771965965e-06,
+ "loss": 0.5133,
+ "step": 207
+ },
+ {
+ "epoch": 0.9780564263322884,
+ "grad_norm": 0.7357563376426697,
+ "learning_rate": 4.895968043470532e-06,
+ "loss": 0.5307,
+ "step": 208
+ },
+ {
+ "epoch": 0.9827586206896551,
+ "grad_norm": 0.515610933303833,
+ "learning_rate": 4.894046394020794e-06,
+ "loss": 0.4955,
+ "step": 209
+ },
+ {
+ "epoch": 0.987460815047022,
+ "grad_norm": 0.5124618411064148,
+ "learning_rate": 4.892107542654988e-06,
+ "loss": 0.526,
+ "step": 210
+ },
+ {
+ "epoch": 0.9921630094043887,
+ "grad_norm": 0.5059565901756287,
+ "learning_rate": 4.890151503304325e-06,
+ "loss": 0.5473,
+ "step": 211
+ },
+ {
+ "epoch": 0.9968652037617555,
+ "grad_norm": 0.4806717336177826,
+ "learning_rate": 4.88817829002351e-06,
+ "loss": 0.5212,
+ "step": 212
+ },
+ {
+ "epoch": 1.0047021943573669,
+ "grad_norm": 0.9454345703125,
+ "learning_rate": 4.886187916990653e-06,
+ "loss": 1.0566,
+ "step": 213
+ },
+ {
+ "epoch": 1.0094043887147335,
+ "grad_norm": 0.4871070086956024,
+ "learning_rate": 4.884180398507163e-06,
+ "loss": 0.503,
+ "step": 214
+ },
+ {
+ "epoch": 1.0141065830721003,
+ "grad_norm": 0.45102012157440186,
+ "learning_rate": 4.882155748997636e-06,
+ "loss": 0.4954,
+ "step": 215
+ },
+ {
+ "epoch": 1.0188087774294672,
+ "grad_norm": 0.49910685420036316,
+ "learning_rate": 4.8801139830097685e-06,
+ "loss": 0.5019,
+ "step": 216
+ },
+ {
+ "epoch": 1.0235109717868338,
+ "grad_norm": 0.5155763030052185,
+ "learning_rate": 4.878055115214238e-06,
+ "loss": 0.5102,
+ "step": 217
+ },
+ {
+ "epoch": 1.0282131661442007,
+ "grad_norm": 0.4567059874534607,
+ "learning_rate": 4.875979160404607e-06,
+ "loss": 0.5069,
+ "step": 218
+ },
+ {
+ "epoch": 1.0329153605015673,
+ "grad_norm": 0.4782896935939789,
+ "learning_rate": 4.873886133497209e-06,
+ "loss": 0.5182,
+ "step": 219
+ },
+ {
+ "epoch": 1.0376175548589341,
+ "grad_norm": 0.44995731115341187,
+ "learning_rate": 4.87177604953105e-06,
+ "loss": 0.513,
+ "step": 220
+ },
+ {
+ "epoch": 1.042319749216301,
+ "grad_norm": 0.470059871673584,
+ "learning_rate": 4.869648923667694e-06,
+ "loss": 0.468,
+ "step": 221
+ },
+ {
+ "epoch": 1.0470219435736676,
+ "grad_norm": 0.5356128215789795,
+ "learning_rate": 4.867504771191154e-06,
+ "loss": 0.4942,
+ "step": 222
+ },
+ {
+ "epoch": 1.0517241379310345,
+ "grad_norm": 0.5137870907783508,
+ "learning_rate": 4.865343607507788e-06,
+ "loss": 0.5022,
+ "step": 223
+ },
+ {
+ "epoch": 1.0564263322884013,
+ "grad_norm": 0.47419992089271545,
+ "learning_rate": 4.86316544814618e-06,
+ "loss": 0.5158,
+ "step": 224
+ },
+ {
+ "epoch": 1.061128526645768,
+ "grad_norm": 0.49087393283843994,
+ "learning_rate": 4.860970308757038e-06,
+ "loss": 0.4605,
+ "step": 225
+ },
+ {
+ "epoch": 1.0658307210031348,
+ "grad_norm": 0.4988348186016083,
+ "learning_rate": 4.858758205113072e-06,
+ "loss": 0.4912,
+ "step": 226
+ },
+ {
+ "epoch": 1.0705329153605017,
+ "grad_norm": 0.44543248414993286,
+ "learning_rate": 4.856529153108888e-06,
+ "loss": 0.524,
+ "step": 227
+ },
+ {
+ "epoch": 1.0752351097178683,
+ "grad_norm": 0.5953351259231567,
+ "learning_rate": 4.854283168760868e-06,
+ "loss": 0.5001,
+ "step": 228
+ },
+ {
+ "epoch": 1.0799373040752351,
+ "grad_norm": 0.5012004375457764,
+ "learning_rate": 4.85202026820706e-06,
+ "loss": 0.4968,
+ "step": 229
+ },
+ {
+ "epoch": 1.084639498432602,
+ "grad_norm": 0.5023937821388245,
+ "learning_rate": 4.84974046770706e-06,
+ "loss": 0.5345,
+ "step": 230
+ },
+ {
+ "epoch": 1.0893416927899686,
+ "grad_norm": 0.4705684185028076,
+ "learning_rate": 4.847443783641893e-06,
+ "loss": 0.4459,
+ "step": 231
+ },
+ {
+ "epoch": 1.0940438871473355,
+ "grad_norm": 0.5082476735115051,
+ "learning_rate": 4.845130232513901e-06,
+ "loss": 0.4905,
+ "step": 232
+ },
+ {
+ "epoch": 1.098746081504702,
+ "grad_norm": 0.5283995866775513,
+ "learning_rate": 4.842799830946615e-06,
+ "loss": 0.4878,
+ "step": 233
+ },
+ {
+ "epoch": 1.103448275862069,
+ "grad_norm": 0.6373623013496399,
+ "learning_rate": 4.840452595684646e-06,
+ "loss": 0.4867,
+ "step": 234
+ },
+ {
+ "epoch": 1.1081504702194358,
+ "grad_norm": 0.4624481201171875,
+ "learning_rate": 4.83808854359356e-06,
+ "loss": 0.4793,
+ "step": 235
+ },
+ {
+ "epoch": 1.1128526645768024,
+ "grad_norm": 0.4659098982810974,
+ "learning_rate": 4.835707691659753e-06,
+ "loss": 0.4827,
+ "step": 236
+ },
+ {
+ "epoch": 1.1175548589341693,
+ "grad_norm": 0.4920850396156311,
+ "learning_rate": 4.8333100569903365e-06,
+ "loss": 0.4932,
+ "step": 237
+ },
+ {
+ "epoch": 1.1222570532915361,
+ "grad_norm": 0.492286741733551,
+ "learning_rate": 4.8308956568130094e-06,
+ "loss": 0.5144,
+ "step": 238
+ },
+ {
+ "epoch": 1.1269592476489028,
+ "grad_norm": 0.5429807901382446,
+ "learning_rate": 4.828464508475934e-06,
+ "loss": 0.5054,
+ "step": 239
+ },
+ {
+ "epoch": 1.1316614420062696,
+ "grad_norm": 2.4671998023986816,
+ "learning_rate": 4.826016629447616e-06,
+ "loss": 0.5073,
+ "step": 240
+ },
+ {
+ "epoch": 1.1363636363636362,
+ "grad_norm": 0.4593118131160736,
+ "learning_rate": 4.823552037316775e-06,
+ "loss": 0.4856,
+ "step": 241
+ },
+ {
+ "epoch": 1.141065830721003,
+ "grad_norm": 0.6855646371841431,
+ "learning_rate": 4.821070749792218e-06,
+ "loss": 0.5388,
+ "step": 242
+ },
+ {
+ "epoch": 1.14576802507837,
+ "grad_norm": 0.5722374320030212,
+ "learning_rate": 4.818572784702713e-06,
+ "loss": 0.51,
+ "step": 243
+ },
+ {
+ "epoch": 1.1504702194357366,
+ "grad_norm": 0.4901357591152191,
+ "learning_rate": 4.816058159996863e-06,
+ "loss": 0.5201,
+ "step": 244
+ },
+ {
+ "epoch": 1.1551724137931034,
+ "grad_norm": 0.4655209481716156,
+ "learning_rate": 4.813526893742972e-06,
+ "loss": 0.501,
+ "step": 245
+ },
+ {
+ "epoch": 1.1598746081504703,
+ "grad_norm": 0.7608394622802734,
+ "learning_rate": 4.810979004128924e-06,
+ "loss": 0.4961,
+ "step": 246
+ },
+ {
+ "epoch": 1.164576802507837,
+ "grad_norm": 0.4857081472873688,
+ "learning_rate": 4.808414509462042e-06,
+ "loss": 0.5174,
+ "step": 247
+ },
+ {
+ "epoch": 1.1692789968652038,
+ "grad_norm": 0.46672946214675903,
+ "learning_rate": 4.80583342816896e-06,
+ "loss": 0.484,
+ "step": 248
+ },
+ {
+ "epoch": 1.1739811912225706,
+ "grad_norm": 0.46982088685035706,
+ "learning_rate": 4.803235778795496e-06,
+ "loss": 0.5236,
+ "step": 249
+ },
+ {
+ "epoch": 1.1786833855799372,
+ "grad_norm": 0.5086098909378052,
+ "learning_rate": 4.800621580006511e-06,
+ "loss": 0.4673,
+ "step": 250
+ },
+ {
+ "epoch": 1.183385579937304,
+ "grad_norm": 0.45968860387802124,
+ "learning_rate": 4.797990850585782e-06,
+ "loss": 0.5151,
+ "step": 251
+ },
+ {
+ "epoch": 1.188087774294671,
+ "grad_norm": 0.49544984102249146,
+ "learning_rate": 4.79534360943586e-06,
+ "loss": 0.494,
+ "step": 252
+ },
+ {
+ "epoch": 1.1927899686520376,
+ "grad_norm": 0.531892716884613,
+ "learning_rate": 4.792679875577937e-06,
+ "loss": 0.4778,
+ "step": 253
+ },
+ {
+ "epoch": 1.1974921630094044,
+ "grad_norm": 0.5013542175292969,
+ "learning_rate": 4.789999668151714e-06,
+ "loss": 0.5132,
+ "step": 254
+ },
+ {
+ "epoch": 1.2021943573667713,
+ "grad_norm": 0.46963250637054443,
+ "learning_rate": 4.7873030064152545e-06,
+ "loss": 0.4938,
+ "step": 255
+ },
+ {
+ "epoch": 1.206896551724138,
+ "grad_norm": 0.465285986661911,
+ "learning_rate": 4.784589909744856e-06,
+ "loss": 0.4898,
+ "step": 256
+ },
+ {
+ "epoch": 1.2115987460815048,
+ "grad_norm": 0.5183936357498169,
+ "learning_rate": 4.7818603976349005e-06,
+ "loss": 0.5004,
+ "step": 257
+ },
+ {
+ "epoch": 1.2163009404388714,
+ "grad_norm": 0.47324836254119873,
+ "learning_rate": 4.779114489697724e-06,
+ "loss": 0.4972,
+ "step": 258
+ },
+ {
+ "epoch": 1.2210031347962382,
+ "grad_norm": 0.5208264589309692,
+ "learning_rate": 4.776352205663469e-06,
+ "loss": 0.5023,
+ "step": 259
+ },
+ {
+ "epoch": 1.225705329153605,
+ "grad_norm": 0.5583804845809937,
+ "learning_rate": 4.773573565379947e-06,
+ "loss": 0.5099,
+ "step": 260
+ },
+ {
+ "epoch": 1.2304075235109717,
+ "grad_norm": 0.5016160011291504,
+ "learning_rate": 4.770778588812489e-06,
+ "loss": 0.4765,
+ "step": 261
+ },
+ {
+ "epoch": 1.2351097178683386,
+ "grad_norm": 0.50210040807724,
+ "learning_rate": 4.7679672960438135e-06,
+ "loss": 0.5029,
+ "step": 262
+ },
+ {
+ "epoch": 1.2398119122257054,
+ "grad_norm": 0.6636150479316711,
+ "learning_rate": 4.765139707273872e-06,
+ "loss": 0.4909,
+ "step": 263
+ },
+ {
+ "epoch": 1.244514106583072,
+ "grad_norm": 0.4798625111579895,
+ "learning_rate": 4.762295842819707e-06,
+ "loss": 0.5012,
+ "step": 264
+ },
+ {
+ "epoch": 1.249216300940439,
+ "grad_norm": 0.5282374024391174,
+ "learning_rate": 4.759435723115308e-06,
+ "loss": 0.4681,
+ "step": 265
+ },
+ {
+ "epoch": 1.2539184952978055,
+ "grad_norm": 0.5356930494308472,
+ "learning_rate": 4.756559368711463e-06,
+ "loss": 0.506,
+ "step": 266
+ },
+ {
+ "epoch": 1.2586206896551724,
+ "grad_norm": 0.4857093095779419,
+ "learning_rate": 4.75366680027561e-06,
+ "loss": 0.4889,
+ "step": 267
+ },
+ {
+ "epoch": 1.2633228840125392,
+ "grad_norm": 0.484018474817276,
+ "learning_rate": 4.7507580385916906e-06,
+ "loss": 0.4899,
+ "step": 268
+ },
+ {
+ "epoch": 1.2680250783699059,
+ "grad_norm": 0.49720871448516846,
+ "learning_rate": 4.747833104559999e-06,
+ "loss": 0.4654,
+ "step": 269
+ },
+ {
+ "epoch": 1.2727272727272727,
+ "grad_norm": 0.4631911516189575,
+ "learning_rate": 4.744892019197033e-06,
+ "loss": 0.4796,
+ "step": 270
+ },
+ {
+ "epoch": 1.2774294670846396,
+ "grad_norm": 0.5116872787475586,
+ "learning_rate": 4.74193480363534e-06,
+ "loss": 0.4883,
+ "step": 271
+ },
+ {
+ "epoch": 1.2821316614420062,
+ "grad_norm": 0.5275093913078308,
+ "learning_rate": 4.738961479123373e-06,
+ "loss": 0.496,
+ "step": 272
+ },
+ {
+ "epoch": 1.286833855799373,
+ "grad_norm": 0.5001885890960693,
+ "learning_rate": 4.735972067025326e-06,
+ "loss": 0.5012,
+ "step": 273
+ },
+ {
+ "epoch": 1.29153605015674,
+ "grad_norm": 0.5875861048698425,
+ "learning_rate": 4.732966588820991e-06,
+ "loss": 0.4951,
+ "step": 274
+ },
+ {
+ "epoch": 1.2962382445141065,
+ "grad_norm": 0.4893011748790741,
+ "learning_rate": 4.729945066105599e-06,
+ "loss": 0.4742,
+ "step": 275
+ },
+ {
+ "epoch": 1.3009404388714734,
+ "grad_norm": 0.4648543894290924,
+ "learning_rate": 4.726907520589664e-06,
+ "loss": 0.466,
+ "step": 276
+ },
+ {
+ "epoch": 1.3056426332288402,
+ "grad_norm": 0.5300162434577942,
+ "learning_rate": 4.72385397409883e-06,
+ "loss": 0.5072,
+ "step": 277
+ },
+ {
+ "epoch": 1.3103448275862069,
+ "grad_norm": 0.4667080044746399,
+ "learning_rate": 4.720784448573712e-06,
+ "loss": 0.4986,
+ "step": 278
+ },
+ {
+ "epoch": 1.3150470219435737,
+ "grad_norm": 0.5278895497322083,
+ "learning_rate": 4.717698966069739e-06,
+ "loss": 0.5269,
+ "step": 279
+ },
+ {
+ "epoch": 1.3197492163009406,
+ "grad_norm": 0.5325866937637329,
+ "learning_rate": 4.7145975487569965e-06,
+ "loss": 0.5074,
+ "step": 280
+ },
+ {
+ "epoch": 1.3244514106583072,
+ "grad_norm": 0.500861644744873,
+ "learning_rate": 4.711480218920064e-06,
+ "loss": 0.4695,
+ "step": 281
+ },
+ {
+ "epoch": 1.329153605015674,
+ "grad_norm": 0.5263222455978394,
+ "learning_rate": 4.708346998957859e-06,
+ "loss": 0.5173,
+ "step": 282
+ },
+ {
+ "epoch": 1.3338557993730409,
+ "grad_norm": 0.622900128364563,
+ "learning_rate": 4.705197911383473e-06,
+ "loss": 0.4905,
+ "step": 283
+ },
+ {
+ "epoch": 1.3385579937304075,
+ "grad_norm": 0.49273768067359924,
+ "learning_rate": 4.7020329788240115e-06,
+ "loss": 0.4743,
+ "step": 284
+ },
+ {
+ "epoch": 1.3432601880877744,
+ "grad_norm": 0.49558964371681213,
+ "learning_rate": 4.6988522240204325e-06,
+ "loss": 0.4824,
+ "step": 285
+ },
+ {
+ "epoch": 1.347962382445141,
+ "grad_norm": 0.4743976891040802,
+ "learning_rate": 4.695655669827377e-06,
+ "loss": 0.4977,
+ "step": 286
+ },
+ {
+ "epoch": 1.3526645768025078,
+ "grad_norm": 0.49542659521102905,
+ "learning_rate": 4.6924433392130135e-06,
+ "loss": 0.4924,
+ "step": 287
+ },
+ {
+ "epoch": 1.3573667711598745,
+ "grad_norm": 0.7385990619659424,
+ "learning_rate": 4.689215255258866e-06,
+ "loss": 0.5091,
+ "step": 288
+ },
+ {
+ "epoch": 1.3620689655172413,
+ "grad_norm": 0.4826123118400574,
+ "learning_rate": 4.685971441159653e-06,
+ "loss": 0.4791,
+ "step": 289
+ },
+ {
+ "epoch": 1.3667711598746082,
+ "grad_norm": 0.5389033555984497,
+ "learning_rate": 4.682711920223115e-06,
+ "loss": 0.4751,
+ "step": 290
+ },
+ {
+ "epoch": 1.3714733542319748,
+ "grad_norm": 0.5059546232223511,
+ "learning_rate": 4.679436715869856e-06,
+ "loss": 0.499,
+ "step": 291
+ },
+ {
+ "epoch": 1.3761755485893417,
+ "grad_norm": 0.5682849884033203,
+ "learning_rate": 4.676145851633166e-06,
+ "loss": 0.5143,
+ "step": 292
+ },
+ {
+ "epoch": 1.3808777429467085,
+ "grad_norm": 0.4754337668418884,
+ "learning_rate": 4.672839351158856e-06,
+ "loss": 0.4997,
+ "step": 293
+ },
+ {
+ "epoch": 1.3855799373040751,
+ "grad_norm": 0.5227643847465515,
+ "learning_rate": 4.669517238205089e-06,
+ "loss": 0.4834,
+ "step": 294
+ },
+ {
+ "epoch": 1.390282131661442,
+ "grad_norm": 0.4954044222831726,
+ "learning_rate": 4.666179536642208e-06,
+ "loss": 0.483,
+ "step": 295
+ },
+ {
+ "epoch": 1.3949843260188088,
+ "grad_norm": 0.4909021556377411,
+ "learning_rate": 4.662826270452565e-06,
+ "loss": 0.4808,
+ "step": 296
+ },
+ {
+ "epoch": 1.3996865203761755,
+ "grad_norm": 0.4666971266269684,
+ "learning_rate": 4.659457463730347e-06,
+ "loss": 0.488,
+ "step": 297
+ },
+ {
+ "epoch": 1.4043887147335423,
+ "grad_norm": 0.5064187049865723,
+ "learning_rate": 4.6560731406814056e-06,
+ "loss": 0.5046,
+ "step": 298
+ },
+ {
+ "epoch": 1.4090909090909092,
+ "grad_norm": 0.4958318769931793,
+ "learning_rate": 4.65267332562308e-06,
+ "loss": 0.5102,
+ "step": 299
+ },
+ {
+ "epoch": 1.4137931034482758,
+ "grad_norm": 0.5080632567405701,
+ "learning_rate": 4.649258042984026e-06,
+ "loss": 0.5055,
+ "step": 300
+ },
+ {
+ "epoch": 1.4184952978056427,
+ "grad_norm": 0.46236541867256165,
+ "learning_rate": 4.6458273173040395e-06,
+ "loss": 0.4606,
+ "step": 301
+ },
+ {
+ "epoch": 1.4231974921630095,
+ "grad_norm": 1.8524898290634155,
+ "learning_rate": 4.642381173233874e-06,
+ "loss": 0.5002,
+ "step": 302
+ },
+ {
+ "epoch": 1.4278996865203761,
+ "grad_norm": 0.5202615261077881,
+ "learning_rate": 4.638919635535073e-06,
+ "loss": 0.4562,
+ "step": 303
+ },
+ {
+ "epoch": 1.432601880877743,
+ "grad_norm": 0.5293647050857544,
+ "learning_rate": 4.635442729079788e-06,
+ "loss": 0.4806,
+ "step": 304
+ },
+ {
+ "epoch": 1.4373040752351098,
+ "grad_norm": 0.5165356993675232,
+ "learning_rate": 4.6319504788505956e-06,
+ "loss": 0.4775,
+ "step": 305
+ },
+ {
+ "epoch": 1.4420062695924765,
+ "grad_norm": 0.5092841386795044,
+ "learning_rate": 4.628442909940325e-06,
+ "loss": 0.4892,
+ "step": 306
+ },
+ {
+ "epoch": 1.4467084639498433,
+ "grad_norm": 0.511424720287323,
+ "learning_rate": 4.624920047551874e-06,
+ "loss": 0.506,
+ "step": 307
+ },
+ {
+ "epoch": 1.4514106583072102,
+ "grad_norm": 0.5631566643714905,
+ "learning_rate": 4.621381916998029e-06,
+ "loss": 0.4741,
+ "step": 308
+ },
+ {
+ "epoch": 1.4561128526645768,
+ "grad_norm": 0.4748315215110779,
+ "learning_rate": 4.6178285437012806e-06,
+ "loss": 0.5084,
+ "step": 309
+ },
+ {
+ "epoch": 1.4608150470219436,
+ "grad_norm": 0.47158119082450867,
+ "learning_rate": 4.6142599531936435e-06,
+ "loss": 0.4697,
+ "step": 310
+ },
+ {
+ "epoch": 1.4655172413793103,
+ "grad_norm": 0.5358107089996338,
+ "learning_rate": 4.610676171116475e-06,
+ "loss": 0.491,
+ "step": 311
+ },
+ {
+ "epoch": 1.4702194357366771,
+ "grad_norm": 0.47717440128326416,
+ "learning_rate": 4.607077223220286e-06,
+ "loss": 0.4948,
+ "step": 312
+ },
+ {
+ "epoch": 1.4749216300940438,
+ "grad_norm": 0.5041193962097168,
+ "learning_rate": 4.603463135364556e-06,
+ "loss": 0.4648,
+ "step": 313
+ },
+ {
+ "epoch": 1.4796238244514106,
+ "grad_norm": 0.9311274290084839,
+ "learning_rate": 4.5998339335175555e-06,
+ "loss": 0.4866,
+ "step": 314
+ },
+ {
+ "epoch": 1.4843260188087775,
+ "grad_norm": 0.47408604621887207,
+ "learning_rate": 4.596189643756147e-06,
+ "loss": 0.4634,
+ "step": 315
+ },
+ {
+ "epoch": 1.489028213166144,
+ "grad_norm": 0.5052632093429565,
+ "learning_rate": 4.592530292265609e-06,
+ "loss": 0.4843,
+ "step": 316
+ },
+ {
+ "epoch": 1.493730407523511,
+ "grad_norm": 0.5100846886634827,
+ "learning_rate": 4.58885590533944e-06,
+ "loss": 0.4942,
+ "step": 317
+ },
+ {
+ "epoch": 1.4984326018808778,
+ "grad_norm": 0.5132214426994324,
+ "learning_rate": 4.585166509379173e-06,
+ "loss": 0.5135,
+ "step": 318
+ },
+ {
+ "epoch": 1.5031347962382444,
+ "grad_norm": 11.112855911254883,
+ "learning_rate": 4.581462130894186e-06,
+ "loss": 0.4933,
+ "step": 319
+ },
+ {
+ "epoch": 1.5078369905956113,
+ "grad_norm": 0.4873805642127991,
+ "learning_rate": 4.57774279650151e-06,
+ "loss": 0.483,
+ "step": 320
+ },
+ {
+ "epoch": 1.5125391849529781,
+ "grad_norm": 0.5026459693908691,
+ "learning_rate": 4.574008532925638e-06,
+ "loss": 0.5075,
+ "step": 321
+ },
+ {
+ "epoch": 1.5172413793103448,
+ "grad_norm": 0.489947110414505,
+ "learning_rate": 4.570259366998336e-06,
+ "loss": 0.4954,
+ "step": 322
+ },
+ {
+ "epoch": 1.5219435736677116,
+ "grad_norm": 0.48120853304862976,
+ "learning_rate": 4.566495325658445e-06,
+ "loss": 0.5221,
+ "step": 323
+ },
+ {
+ "epoch": 1.5266457680250785,
+ "grad_norm": 0.4880066514015198,
+ "learning_rate": 4.5627164359516915e-06,
+ "loss": 0.5031,
+ "step": 324
+ },
+ {
+ "epoch": 1.531347962382445,
+ "grad_norm": 0.5048410892486572,
+ "learning_rate": 4.558922725030491e-06,
+ "loss": 0.4757,
+ "step": 325
+ },
+ {
+ "epoch": 1.536050156739812,
+ "grad_norm": 0.7033756375312805,
+ "learning_rate": 4.555114220153755e-06,
+ "loss": 0.4285,
+ "step": 326
+ },
+ {
+ "epoch": 1.5407523510971788,
+ "grad_norm": 0.4716516435146332,
+ "learning_rate": 4.551290948686693e-06,
+ "loss": 0.5121,
+ "step": 327
+ },
+ {
+ "epoch": 1.5454545454545454,
+ "grad_norm": 0.4782696068286896,
+ "learning_rate": 4.547452938100615e-06,
+ "loss": 0.5176,
+ "step": 328
+ },
+ {
+ "epoch": 1.5501567398119123,
+ "grad_norm": 0.5119273066520691,
+ "learning_rate": 4.54360021597274e-06,
+ "loss": 0.4941,
+ "step": 329
+ },
+ {
+ "epoch": 1.5548589341692791,
+ "grad_norm": 0.5010069608688354,
+ "learning_rate": 4.539732809985989e-06,
+ "loss": 0.4862,
+ "step": 330
+ },
+ {
+ "epoch": 1.5595611285266457,
+ "grad_norm": 0.5129932165145874,
+ "learning_rate": 4.535850747928796e-06,
+ "loss": 0.4978,
+ "step": 331
+ },
+ {
+ "epoch": 1.5642633228840124,
+ "grad_norm": 0.4957594573497772,
+ "learning_rate": 4.531954057694897e-06,
+ "loss": 0.4814,
+ "step": 332
+ },
+ {
+ "epoch": 1.5689655172413794,
+ "grad_norm": 0.5642824172973633,
+ "learning_rate": 4.5280427672831414e-06,
+ "loss": 0.4888,
+ "step": 333
+ },
+ {
+ "epoch": 1.573667711598746,
+ "grad_norm": 0.4562854468822479,
+ "learning_rate": 4.524116904797281e-06,
+ "loss": 0.4648,
+ "step": 334
+ },
+ {
+ "epoch": 1.5783699059561127,
+ "grad_norm": 0.4849218428134918,
+ "learning_rate": 4.520176498445774e-06,
+ "loss": 0.476,
+ "step": 335
+ },
+ {
+ "epoch": 1.5830721003134798,
+ "grad_norm": 0.5046947002410889,
+ "learning_rate": 4.516221576541581e-06,
+ "loss": 0.4776,
+ "step": 336
+ },
+ {
+ "epoch": 1.5877742946708464,
+ "grad_norm": 0.48211777210235596,
+ "learning_rate": 4.512252167501959e-06,
+ "loss": 0.479,
+ "step": 337
+ },
+ {
+ "epoch": 1.592476489028213,
+ "grad_norm": 0.4812171459197998,
+ "learning_rate": 4.508268299848262e-06,
+ "loss": 0.4849,
+ "step": 338
+ },
+ {
+ "epoch": 1.59717868338558,
+ "grad_norm": 0.5865142345428467,
+ "learning_rate": 4.50427000220573e-06,
+ "loss": 0.499,
+ "step": 339
+ },
+ {
+ "epoch": 1.6018808777429467,
+ "grad_norm": 0.49277785420417786,
+ "learning_rate": 4.50025730330329e-06,
+ "loss": 0.475,
+ "step": 340
+ },
+ {
+ "epoch": 1.6065830721003134,
+ "grad_norm": 0.46771496534347534,
+ "learning_rate": 4.4962302319733445e-06,
+ "loss": 0.494,
+ "step": 341
+ },
+ {
+ "epoch": 1.6112852664576802,
+ "grad_norm": 0.5189441442489624,
+ "learning_rate": 4.492188817151565e-06,
+ "loss": 0.5275,
+ "step": 342
+ },
+ {
+ "epoch": 1.615987460815047,
+ "grad_norm": 0.48845574259757996,
+ "learning_rate": 4.488133087876688e-06,
+ "loss": 0.4676,
+ "step": 343
+ },
+ {
+ "epoch": 1.6206896551724137,
+ "grad_norm": 0.47189632058143616,
+ "learning_rate": 4.484063073290301e-06,
+ "loss": 0.4642,
+ "step": 344
+ },
+ {
+ "epoch": 1.6253918495297806,
+ "grad_norm": 0.5442587733268738,
+ "learning_rate": 4.479978802636637e-06,
+ "loss": 0.4981,
+ "step": 345
+ },
+ {
+ "epoch": 1.6300940438871474,
+ "grad_norm": 0.5048685073852539,
+ "learning_rate": 4.475880305262362e-06,
+ "loss": 0.5037,
+ "step": 346
+ },
+ {
+ "epoch": 1.634796238244514,
+ "grad_norm": 0.4781409800052643,
+ "learning_rate": 4.471767610616366e-06,
+ "loss": 0.4932,
+ "step": 347
+ },
+ {
+ "epoch": 1.6394984326018809,
+ "grad_norm": 0.47388938069343567,
+ "learning_rate": 4.467640748249549e-06,
+ "loss": 0.4687,
+ "step": 348
+ },
+ {
+ "epoch": 1.6442006269592477,
+ "grad_norm": 0.529712438583374,
+ "learning_rate": 4.4634997478146125e-06,
+ "loss": 0.487,
+ "step": 349
+ },
+ {
+ "epoch": 1.6489028213166144,
+ "grad_norm": 0.5114791393280029,
+ "learning_rate": 4.459344639065842e-06,
+ "loss": 0.4809,
+ "step": 350
+ },
+ {
+ "epoch": 1.6536050156739812,
+ "grad_norm": 0.45415258407592773,
+ "learning_rate": 4.455175451858897e-06,
+ "loss": 0.4901,
+ "step": 351
+ },
+ {
+ "epoch": 1.658307210031348,
+ "grad_norm": 0.5842339396476746,
+ "learning_rate": 4.450992216150592e-06,
+ "loss": 0.499,
+ "step": 352
+ },
+ {
+ "epoch": 1.6630094043887147,
+ "grad_norm": 0.48795560002326965,
+ "learning_rate": 4.446794961998689e-06,
+ "loss": 0.4659,
+ "step": 353
+ },
+ {
+ "epoch": 1.6677115987460815,
+ "grad_norm": 0.5531855225563049,
+ "learning_rate": 4.442583719561671e-06,
+ "loss": 0.4923,
+ "step": 354
+ },
+ {
+ "epoch": 1.6724137931034484,
+ "grad_norm": 0.5827644467353821,
+ "learning_rate": 4.438358519098536e-06,
+ "loss": 0.4991,
+ "step": 355
+ },
+ {
+ "epoch": 1.677115987460815,
+ "grad_norm": 0.5260423421859741,
+ "learning_rate": 4.4341193909685685e-06,
+ "loss": 0.4843,
+ "step": 356
+ },
+ {
+ "epoch": 1.6818181818181817,
+ "grad_norm": 0.4969344437122345,
+ "learning_rate": 4.429866365631134e-06,
+ "loss": 0.4915,
+ "step": 357
+ },
+ {
+ "epoch": 1.6865203761755487,
+ "grad_norm": 0.4725005030632019,
+ "learning_rate": 4.425599473645447e-06,
+ "loss": 0.4804,
+ "step": 358
+ },
+ {
+ "epoch": 1.6912225705329154,
+ "grad_norm": 0.47171467542648315,
+ "learning_rate": 4.421318745670364e-06,
+ "loss": 0.4823,
+ "step": 359
+ },
+ {
+ "epoch": 1.695924764890282,
+ "grad_norm": 0.4839799106121063,
+ "learning_rate": 4.4170242124641524e-06,
+ "loss": 0.4585,
+ "step": 360
+ },
+ {
+ "epoch": 1.700626959247649,
+ "grad_norm": 0.4786856472492218,
+ "learning_rate": 4.412715904884277e-06,
+ "loss": 0.49,
+ "step": 361
+ },
+ {
+ "epoch": 1.7053291536050157,
+ "grad_norm": 0.49980080127716064,
+ "learning_rate": 4.4083938538871735e-06,
+ "loss": 0.4675,
+ "step": 362
+ },
+ {
+ "epoch": 1.7100313479623823,
+ "grad_norm": 0.5201369524002075,
+ "learning_rate": 4.4040580905280295e-06,
+ "loss": 0.4862,
+ "step": 363
+ },
+ {
+ "epoch": 1.7147335423197492,
+ "grad_norm": 0.7051575183868408,
+ "learning_rate": 4.3997086459605586e-06,
+ "loss": 0.4822,
+ "step": 364
+ },
+ {
+ "epoch": 1.719435736677116,
+ "grad_norm": 0.48206666111946106,
+ "learning_rate": 4.395345551436779e-06,
+ "loss": 0.5076,
+ "step": 365
+ },
+ {
+ "epoch": 1.7241379310344827,
+ "grad_norm": 0.4817257821559906,
+ "learning_rate": 4.390968838306788e-06,
+ "loss": 0.4623,
+ "step": 366
+ },
+ {
+ "epoch": 1.7288401253918495,
+ "grad_norm": 0.5547840595245361,
+ "learning_rate": 4.386578538018535e-06,
+ "loss": 0.461,
+ "step": 367
+ },
+ {
+ "epoch": 1.7335423197492164,
+ "grad_norm": 0.5085346698760986,
+ "learning_rate": 4.382174682117598e-06,
+ "loss": 0.5068,
+ "step": 368
+ },
+ {
+ "epoch": 1.738244514106583,
+ "grad_norm": 0.4870692193508148,
+ "learning_rate": 4.377757302246956e-06,
+ "loss": 0.4403,
+ "step": 369
+ },
+ {
+ "epoch": 1.7429467084639498,
+ "grad_norm": 0.49482715129852295,
+ "learning_rate": 4.373326430146762e-06,
+ "loss": 0.4986,
+ "step": 370
+ },
+ {
+ "epoch": 1.7476489028213167,
+ "grad_norm": 0.5474854707717896,
+ "learning_rate": 4.368882097654113e-06,
+ "loss": 0.4938,
+ "step": 371
+ },
+ {
+ "epoch": 1.7523510971786833,
+ "grad_norm": 0.5055244565010071,
+ "learning_rate": 4.364424336702825e-06,
+ "loss": 0.4711,
+ "step": 372
+ },
+ {
+ "epoch": 1.7570532915360502,
+ "grad_norm": 0.48241329193115234,
+ "learning_rate": 4.3599531793232e-06,
+ "loss": 0.4856,
+ "step": 373
+ },
+ {
+ "epoch": 1.761755485893417,
+ "grad_norm": 0.4932602047920227,
+ "learning_rate": 4.355468657641797e-06,
+ "loss": 0.4818,
+ "step": 374
+ },
+ {
+ "epoch": 1.7664576802507836,
+ "grad_norm": 0.5512160658836365,
+ "learning_rate": 4.3509708038812035e-06,
+ "loss": 0.4864,
+ "step": 375
+ },
+ {
+ "epoch": 1.7711598746081505,
+ "grad_norm": 0.47026327252388,
+ "learning_rate": 4.346459650359798e-06,
+ "loss": 0.4825,
+ "step": 376
+ },
+ {
+ "epoch": 1.7758620689655173,
+ "grad_norm": 0.4831086993217468,
+ "learning_rate": 4.341935229491525e-06,
+ "loss": 0.4541,
+ "step": 377
+ },
+ {
+ "epoch": 1.780564263322884,
+ "grad_norm": 0.5045217871665955,
+ "learning_rate": 4.337397573785659e-06,
+ "loss": 0.5025,
+ "step": 378
+ },
+ {
+ "epoch": 1.7852664576802508,
+ "grad_norm": 0.5657753348350525,
+ "learning_rate": 4.332846715846566e-06,
+ "loss": 0.4698,
+ "step": 379
+ },
+ {
+ "epoch": 1.7899686520376177,
+ "grad_norm": 0.49546748399734497,
+ "learning_rate": 4.328282688373479e-06,
+ "loss": 0.4911,
+ "step": 380
+ },
+ {
+ "epoch": 1.7946708463949843,
+ "grad_norm": 0.5037291049957275,
+ "learning_rate": 4.323705524160258e-06,
+ "loss": 0.4877,
+ "step": 381
+ },
+ {
+ "epoch": 1.799373040752351,
+ "grad_norm": 0.5256901383399963,
+ "learning_rate": 4.319115256095149e-06,
+ "loss": 0.4662,
+ "step": 382
+ },
+ {
+ "epoch": 1.804075235109718,
+ "grad_norm": 0.4890702962875366,
+ "learning_rate": 4.314511917160557e-06,
+ "loss": 0.4683,
+ "step": 383
+ },
+ {
+ "epoch": 1.8087774294670846,
+ "grad_norm": 0.4724109470844269,
+ "learning_rate": 4.3098955404328045e-06,
+ "loss": 0.4602,
+ "step": 384
+ },
+ {
+ "epoch": 1.8134796238244513,
+ "grad_norm": 0.4933278560638428,
+ "learning_rate": 4.305266159081895e-06,
+ "loss": 0.4806,
+ "step": 385
+ },
+ {
+ "epoch": 1.8181818181818183,
+ "grad_norm": 0.5068219304084778,
+ "learning_rate": 4.3006238063712725e-06,
+ "loss": 0.4647,
+ "step": 386
+ },
+ {
+ "epoch": 1.822884012539185,
+ "grad_norm": 0.5293509364128113,
+ "learning_rate": 4.295968515657583e-06,
+ "loss": 0.4998,
+ "step": 387
+ },
+ {
+ "epoch": 1.8275862068965516,
+ "grad_norm": 0.4775199294090271,
+ "learning_rate": 4.29130032039044e-06,
+ "loss": 0.4821,
+ "step": 388
+ },
+ {
+ "epoch": 1.8322884012539185,
+ "grad_norm": 0.4914006292819977,
+ "learning_rate": 4.2866192541121755e-06,
+ "loss": 0.4735,
+ "step": 389
+ },
+ {
+ "epoch": 1.8369905956112853,
+ "grad_norm": 0.5009908080101013,
+ "learning_rate": 4.281925350457606e-06,
+ "loss": 0.4741,
+ "step": 390
+ },
+ {
+ "epoch": 1.841692789968652,
+ "grad_norm": 0.47211164236068726,
+ "learning_rate": 4.277218643153787e-06,
+ "loss": 0.4786,
+ "step": 391
+ },
+ {
+ "epoch": 1.8463949843260188,
+ "grad_norm": 1.9644113779067993,
+ "learning_rate": 4.272499166019771e-06,
+ "loss": 0.4759,
+ "step": 392
+ },
+ {
+ "epoch": 1.8510971786833856,
+ "grad_norm": 0.535971999168396,
+ "learning_rate": 4.267766952966369e-06,
+ "loss": 0.4665,
+ "step": 393
+ },
+ {
+ "epoch": 1.8557993730407523,
+ "grad_norm": 0.4666787385940552,
+ "learning_rate": 4.2630220379959006e-06,
+ "loss": 0.4417,
+ "step": 394
+ },
+ {
+ "epoch": 1.8605015673981191,
+ "grad_norm": 0.5976264476776123,
+ "learning_rate": 4.258264455201953e-06,
+ "loss": 0.4665,
+ "step": 395
+ },
+ {
+ "epoch": 1.865203761755486,
+ "grad_norm": 0.4814331531524658,
+ "learning_rate": 4.2534942387691335e-06,
+ "loss": 0.4896,
+ "step": 396
+ },
+ {
+ "epoch": 1.8699059561128526,
+ "grad_norm": 0.4929859936237335,
+ "learning_rate": 4.248711422972829e-06,
+ "loss": 0.4765,
+ "step": 397
+ },
+ {
+ "epoch": 1.8746081504702194,
+ "grad_norm": 0.517914354801178,
+ "learning_rate": 4.243916042178954e-06,
+ "loss": 0.4601,
+ "step": 398
+ },
+ {
+ "epoch": 1.8793103448275863,
+ "grad_norm": 0.47731271386146545,
+ "learning_rate": 4.239108130843709e-06,
+ "loss": 0.469,
+ "step": 399
+ },
+ {
+ "epoch": 1.884012539184953,
+ "grad_norm": 0.4939954876899719,
+ "learning_rate": 4.234287723513326e-06,
+ "loss": 0.4929,
+ "step": 400
+ },
+ {
+ "epoch": 1.8887147335423198,
+ "grad_norm": 0.48573923110961914,
+ "learning_rate": 4.229454854823827e-06,
+ "loss": 0.4913,
+ "step": 401
+ },
+ {
+ "epoch": 1.8934169278996866,
+ "grad_norm": 0.5146409273147583,
+ "learning_rate": 4.224609559500772e-06,
+ "loss": 0.502,
+ "step": 402
+ },
+ {
+ "epoch": 1.8981191222570533,
+ "grad_norm": 0.4884675443172455,
+ "learning_rate": 4.21975187235901e-06,
+ "loss": 0.4541,
+ "step": 403
+ },
+ {
+ "epoch": 1.90282131661442,
+ "grad_norm": 0.4871810972690582,
+ "learning_rate": 4.21488182830243e-06,
+ "loss": 0.4811,
+ "step": 404
+ },
+ {
+ "epoch": 1.907523510971787,
+ "grad_norm": 0.5089552402496338,
+ "learning_rate": 4.209999462323706e-06,
+ "loss": 0.4584,
+ "step": 405
+ },
+ {
+ "epoch": 1.9122257053291536,
+ "grad_norm": 0.6191231608390808,
+ "learning_rate": 4.20510480950405e-06,
+ "loss": 0.4885,
+ "step": 406
+ },
+ {
+ "epoch": 1.9169278996865202,
+ "grad_norm": 0.5512096285820007,
+ "learning_rate": 4.200197905012961e-06,
+ "loss": 0.4529,
+ "step": 407
+ },
+ {
+ "epoch": 1.9216300940438873,
+ "grad_norm": 0.4743112027645111,
+ "learning_rate": 4.195278784107965e-06,
+ "loss": 0.4702,
+ "step": 408
+ },
+ {
+ "epoch": 1.926332288401254,
+ "grad_norm": 0.4635118544101715,
+ "learning_rate": 4.19034748213437e-06,
+ "loss": 0.4718,
+ "step": 409
+ },
+ {
+ "epoch": 1.9310344827586206,
+ "grad_norm": 0.48715919256210327,
+ "learning_rate": 4.185404034525008e-06,
+ "loss": 0.4638,
+ "step": 410
+ },
+ {
+ "epoch": 1.9357366771159876,
+ "grad_norm": 0.5373724102973938,
+ "learning_rate": 4.180448476799981e-06,
+ "loss": 0.5009,
+ "step": 411
+ },
+ {
+ "epoch": 1.9404388714733543,
+ "grad_norm": 0.4978715479373932,
+ "learning_rate": 4.175480844566404e-06,
+ "loss": 0.4726,
+ "step": 412
+ },
+ {
+ "epoch": 1.9451410658307209,
+ "grad_norm": 0.44817060232162476,
+ "learning_rate": 4.170501173518152e-06,
+ "loss": 0.4683,
+ "step": 413
+ },
+ {
+ "epoch": 1.9498432601880877,
+ "grad_norm": 0.48472973704338074,
+ "learning_rate": 4.165509499435604e-06,
+ "loss": 0.4662,
+ "step": 414
+ },
+ {
+ "epoch": 1.9545454545454546,
+ "grad_norm": 0.6567174792289734,
+ "learning_rate": 4.16050585818538e-06,
+ "loss": 0.4801,
+ "step": 415
+ },
+ {
+ "epoch": 1.9592476489028212,
+ "grad_norm": 0.5131425857543945,
+ "learning_rate": 4.155490285720092e-06,
+ "loss": 0.5036,
+ "step": 416
+ },
+ {
+ "epoch": 1.963949843260188,
+ "grad_norm": 0.46051982045173645,
+ "learning_rate": 4.150462818078079e-06,
+ "loss": 0.4911,
+ "step": 417
+ },
+ {
+ "epoch": 1.968652037617555,
+ "grad_norm": 0.5288883447647095,
+ "learning_rate": 4.145423491383153e-06,
+ "loss": 0.4871,
+ "step": 418
+ },
+ {
+ "epoch": 1.9733542319749215,
+ "grad_norm": 0.5143817663192749,
+ "learning_rate": 4.14037234184433e-06,
+ "loss": 0.5027,
+ "step": 419
+ },
+ {
+ "epoch": 1.9780564263322884,
+ "grad_norm": 0.46323707699775696,
+ "learning_rate": 4.135309405755583e-06,
+ "loss": 0.4876,
+ "step": 420
+ },
+ {
+ "epoch": 1.9827586206896552,
+ "grad_norm": 0.5239706039428711,
+ "learning_rate": 4.130234719495574e-06,
+ "loss": 0.4702,
+ "step": 421
+ },
+ {
+ "epoch": 1.9874608150470219,
+ "grad_norm": 0.538753867149353,
+ "learning_rate": 4.125148319527391e-06,
+ "loss": 0.4638,
+ "step": 422
+ },
+ {
+ "epoch": 1.9921630094043887,
+ "grad_norm": 0.5180181860923767,
+ "learning_rate": 4.1200502423982904e-06,
+ "loss": 0.4841,
+ "step": 423
+ },
+ {
+ "epoch": 1.9968652037617556,
+ "grad_norm": 0.6698167324066162,
+ "learning_rate": 4.1149405247394295e-06,
+ "loss": 0.4882,
+ "step": 424
+ },
+ {
+ "epoch": 2.0047021943573666,
+ "grad_norm": 0.9728522896766663,
+ "learning_rate": 4.10981920326561e-06,
+ "loss": 0.9125,
+ "step": 425
+ },
+ {
+ "epoch": 2.0094043887147337,
+ "grad_norm": 0.7356107831001282,
+ "learning_rate": 4.104686314775009e-06,
+ "loss": 0.4422,
+ "step": 426
+ },
+ {
+ "epoch": 2.0141065830721003,
+ "grad_norm": 0.44414228200912476,
+ "learning_rate": 4.099541896148914e-06,
+ "loss": 0.4511,
+ "step": 427
+ },
+ {
+ "epoch": 2.018808777429467,
+ "grad_norm": 0.5738011002540588,
+ "learning_rate": 4.094385984351462e-06,
+ "loss": 0.4457,
+ "step": 428
+ },
+ {
+ "epoch": 2.023510971786834,
+ "grad_norm": 0.4643106460571289,
+ "learning_rate": 4.0892186164293715e-06,
+ "loss": 0.4644,
+ "step": 429
+ },
+ {
+ "epoch": 2.0282131661442007,
+ "grad_norm": 0.5355309247970581,
+ "learning_rate": 4.0840398295116745e-06,
+ "loss": 0.4535,
+ "step": 430
+ },
+ {
+ "epoch": 2.0329153605015673,
+ "grad_norm": 0.512458324432373,
+ "learning_rate": 4.078849660809456e-06,
+ "loss": 0.4481,
+ "step": 431
+ },
+ {
+ "epoch": 2.0376175548589344,
+ "grad_norm": 0.5055253505706787,
+ "learning_rate": 4.073648147615579e-06,
+ "loss": 0.4309,
+ "step": 432
+ },
+ {
+ "epoch": 2.042319749216301,
+ "grad_norm": 0.5128353834152222,
+ "learning_rate": 4.068435327304421e-06,
+ "loss": 0.4562,
+ "step": 433
+ },
+ {
+ "epoch": 2.0470219435736676,
+ "grad_norm": 0.4432103633880615,
+ "learning_rate": 4.063211237331603e-06,
+ "loss": 0.4535,
+ "step": 434
+ },
+ {
+ "epoch": 2.0517241379310347,
+ "grad_norm": 0.5092498660087585,
+ "learning_rate": 4.057975915233725e-06,
+ "loss": 0.4385,
+ "step": 435
+ },
+ {
+ "epoch": 2.0564263322884013,
+ "grad_norm": 0.4798133671283722,
+ "learning_rate": 4.052729398628089e-06,
+ "loss": 0.466,
+ "step": 436
+ },
+ {
+ "epoch": 2.061128526645768,
+ "grad_norm": 0.5094019770622253,
+ "learning_rate": 4.047471725212437e-06,
+ "loss": 0.4624,
+ "step": 437
+ },
+ {
+ "epoch": 2.0658307210031346,
+ "grad_norm": 0.5814178586006165,
+ "learning_rate": 4.042202932764673e-06,
+ "loss": 0.4472,
+ "step": 438
+ },
+ {
+ "epoch": 2.0705329153605017,
+ "grad_norm": 0.503394365310669,
+ "learning_rate": 4.036923059142595e-06,
+ "loss": 0.4481,
+ "step": 439
+ },
+ {
+ "epoch": 2.0752351097178683,
+ "grad_norm": 0.5108861923217773,
+ "learning_rate": 4.031632142283623e-06,
+ "loss": 0.4416,
+ "step": 440
+ },
+ {
+ "epoch": 2.079937304075235,
+ "grad_norm": 0.5303971171379089,
+ "learning_rate": 4.026330220204524e-06,
+ "loss": 0.4515,
+ "step": 441
+ },
+ {
+ "epoch": 2.084639498432602,
+ "grad_norm": 0.45014286041259766,
+ "learning_rate": 4.021017331001146e-06,
+ "loss": 0.441,
+ "step": 442
+ },
+ {
+ "epoch": 2.0893416927899686,
+ "grad_norm": 0.5371219515800476,
+ "learning_rate": 4.015693512848131e-06,
+ "loss": 0.4471,
+ "step": 443
+ },
+ {
+ "epoch": 2.0940438871473352,
+ "grad_norm": 0.5105510354042053,
+ "learning_rate": 4.0103588039986556e-06,
+ "loss": 0.4534,
+ "step": 444
+ },
+ {
+ "epoch": 2.0987460815047023,
+ "grad_norm": 0.4960611164569855,
+ "learning_rate": 4.005013242784146e-06,
+ "loss": 0.46,
+ "step": 445
+ },
+ {
+ "epoch": 2.103448275862069,
+ "grad_norm": 0.500354528427124,
+ "learning_rate": 3.999656867614006e-06,
+ "loss": 0.45,
+ "step": 446
+ },
+ {
+ "epoch": 2.1081504702194356,
+ "grad_norm": 0.4733876585960388,
+ "learning_rate": 3.994289716975341e-06,
+ "loss": 0.4644,
+ "step": 447
+ },
+ {
+ "epoch": 2.1128526645768027,
+ "grad_norm": 0.5002915263175964,
+ "learning_rate": 3.988911829432682e-06,
+ "loss": 0.4493,
+ "step": 448
+ },
+ {
+ "epoch": 2.1175548589341693,
+ "grad_norm": 0.48520293831825256,
+ "learning_rate": 3.983523243627706e-06,
+ "loss": 0.4458,
+ "step": 449
+ },
+ {
+ "epoch": 2.122257053291536,
+ "grad_norm": 0.6339934468269348,
+ "learning_rate": 3.978123998278962e-06,
+ "loss": 0.4352,
+ "step": 450
+ },
+ {
+ "epoch": 2.126959247648903,
+ "grad_norm": 1.172338843345642,
+ "learning_rate": 3.97271413218159e-06,
+ "loss": 0.4664,
+ "step": 451
+ },
+ {
+ "epoch": 2.1316614420062696,
+ "grad_norm": 0.47842296957969666,
+ "learning_rate": 3.9672936842070425e-06,
+ "loss": 0.4604,
+ "step": 452
+ },
+ {
+ "epoch": 2.1363636363636362,
+ "grad_norm": 0.506851077079773,
+ "learning_rate": 3.9618626933028086e-06,
+ "loss": 0.4674,
+ "step": 453
+ },
+ {
+ "epoch": 2.1410658307210033,
+ "grad_norm": 0.4922677278518677,
+ "learning_rate": 3.956421198492128e-06,
+ "loss": 0.4476,
+ "step": 454
+ },
+ {
+ "epoch": 2.14576802507837,
+ "grad_norm": 0.5307339429855347,
+ "learning_rate": 3.950969238873714e-06,
+ "loss": 0.4463,
+ "step": 455
+ },
+ {
+ "epoch": 2.1504702194357366,
+ "grad_norm": 0.5131121873855591,
+ "learning_rate": 3.9455068536214765e-06,
+ "loss": 0.4779,
+ "step": 456
+ },
+ {
+ "epoch": 2.1551724137931036,
+ "grad_norm": 0.5438089966773987,
+ "learning_rate": 3.9400340819842335e-06,
+ "loss": 0.4563,
+ "step": 457
+ },
+ {
+ "epoch": 2.1598746081504703,
+ "grad_norm": 0.7426711916923523,
+ "learning_rate": 3.934550963285432e-06,
+ "loss": 0.4561,
+ "step": 458
+ },
+ {
+ "epoch": 2.164576802507837,
+ "grad_norm": 0.482920378446579,
+ "learning_rate": 3.9290575369228664e-06,
+ "loss": 0.4293,
+ "step": 459
+ },
+ {
+ "epoch": 2.169278996865204,
+ "grad_norm": 0.6583715081214905,
+ "learning_rate": 3.923553842368396e-06,
+ "loss": 0.4682,
+ "step": 460
+ },
+ {
+ "epoch": 2.1739811912225706,
+ "grad_norm": 0.47901806235313416,
+ "learning_rate": 3.918039919167658e-06,
+ "loss": 0.4342,
+ "step": 461
+ },
+ {
+ "epoch": 2.1786833855799372,
+ "grad_norm": 0.4929746389389038,
+ "learning_rate": 3.912515806939786e-06,
+ "loss": 0.4478,
+ "step": 462
+ },
+ {
+ "epoch": 2.183385579937304,
+ "grad_norm": 0.48205333948135376,
+ "learning_rate": 3.906981545377124e-06,
+ "loss": 0.4595,
+ "step": 463
+ },
+ {
+ "epoch": 2.188087774294671,
+ "grad_norm": 0.5059337019920349,
+ "learning_rate": 3.901437174244943e-06,
+ "loss": 0.4294,
+ "step": 464
+ },
+ {
+ "epoch": 2.1927899686520376,
+ "grad_norm": 0.4752981662750244,
+ "learning_rate": 3.895882733381154e-06,
+ "loss": 0.448,
+ "step": 465
+ },
+ {
+ "epoch": 2.197492163009404,
+ "grad_norm": 0.5249196290969849,
+ "learning_rate": 3.890318262696023e-06,
+ "loss": 0.4655,
+ "step": 466
+ },
+ {
+ "epoch": 2.2021943573667713,
+ "grad_norm": 0.48044726252555847,
+ "learning_rate": 3.8847438021718805e-06,
+ "loss": 0.4413,
+ "step": 467
+ },
+ {
+ "epoch": 2.206896551724138,
+ "grad_norm": 0.84516841173172,
+ "learning_rate": 3.879159391862839e-06,
+ "loss": 0.4645,
+ "step": 468
+ },
+ {
+ "epoch": 2.2115987460815045,
+ "grad_norm": 0.5334392786026001,
+ "learning_rate": 3.873565071894503e-06,
+ "loss": 0.4347,
+ "step": 469
+ },
+ {
+ "epoch": 2.2163009404388716,
+ "grad_norm": 0.5113687515258789,
+ "learning_rate": 3.86796088246368e-06,
+ "loss": 0.4314,
+ "step": 470
+ },
+ {
+ "epoch": 2.2210031347962382,
+ "grad_norm": 0.5226101279258728,
+ "learning_rate": 3.8623468638380905e-06,
+ "loss": 0.418,
+ "step": 471
+ },
+ {
+ "epoch": 2.225705329153605,
+ "grad_norm": 0.4901522099971771,
+ "learning_rate": 3.856723056356085e-06,
+ "loss": 0.4597,
+ "step": 472
+ },
+ {
+ "epoch": 2.230407523510972,
+ "grad_norm": 0.5312012434005737,
+ "learning_rate": 3.851089500426346e-06,
+ "loss": 0.4444,
+ "step": 473
+ },
+ {
+ "epoch": 2.2351097178683386,
+ "grad_norm": 0.5347906351089478,
+ "learning_rate": 3.845446236527605e-06,
+ "loss": 0.4447,
+ "step": 474
+ },
+ {
+ "epoch": 2.239811912225705,
+ "grad_norm": 0.4781494438648224,
+ "learning_rate": 3.8397933052083445e-06,
+ "loss": 0.462,
+ "step": 475
+ },
+ {
+ "epoch": 2.2445141065830723,
+ "grad_norm": 0.5215012431144714,
+ "learning_rate": 3.834130747086512e-06,
+ "loss": 0.4475,
+ "step": 476
+ },
+ {
+ "epoch": 2.249216300940439,
+ "grad_norm": 0.5048666000366211,
+ "learning_rate": 3.828458602849226e-06,
+ "loss": 0.4483,
+ "step": 477
+ },
+ {
+ "epoch": 2.2539184952978055,
+ "grad_norm": 0.5508173108100891,
+ "learning_rate": 3.822776913252485e-06,
+ "loss": 0.4511,
+ "step": 478
+ },
+ {
+ "epoch": 2.2586206896551726,
+ "grad_norm": 0.5031043887138367,
+ "learning_rate": 3.817085719120872e-06,
+ "loss": 0.4019,
+ "step": 479
+ },
+ {
+ "epoch": 2.2633228840125392,
+ "grad_norm": 0.508939802646637,
+ "learning_rate": 3.811385061347263e-06,
+ "loss": 0.4461,
+ "step": 480
+ },
+ {
+ "epoch": 2.268025078369906,
+ "grad_norm": 0.5605170726776123,
+ "learning_rate": 3.805674980892535e-06,
+ "loss": 0.4695,
+ "step": 481
+ },
+ {
+ "epoch": 2.2727272727272725,
+ "grad_norm": 0.5526806712150574,
+ "learning_rate": 3.7999555187852667e-06,
+ "loss": 0.4575,
+ "step": 482
+ },
+ {
+ "epoch": 2.2774294670846396,
+ "grad_norm": 0.47659724950790405,
+ "learning_rate": 3.7942267161214497e-06,
+ "loss": 0.4433,
+ "step": 483
+ },
+ {
+ "epoch": 2.282131661442006,
+ "grad_norm": 0.49713975191116333,
+ "learning_rate": 3.7884886140641884e-06,
+ "loss": 0.4692,
+ "step": 484
+ },
+ {
+ "epoch": 2.2868338557993733,
+ "grad_norm": 0.48685988783836365,
+ "learning_rate": 3.7827412538434062e-06,
+ "loss": 0.4328,
+ "step": 485
+ },
+ {
+ "epoch": 2.29153605015674,
+ "grad_norm": 0.5074832439422607,
+ "learning_rate": 3.7769846767555495e-06,
+ "loss": 0.4598,
+ "step": 486
+ },
+ {
+ "epoch": 2.2962382445141065,
+ "grad_norm": 0.5333994030952454,
+ "learning_rate": 3.7712189241632898e-06,
+ "loss": 0.4554,
+ "step": 487
+ },
+ {
+ "epoch": 2.300940438871473,
+ "grad_norm": 0.49985551834106445,
+ "learning_rate": 3.7654440374952288e-06,
+ "loss": 0.4421,
+ "step": 488
+ },
+ {
+ "epoch": 2.30564263322884,
+ "grad_norm": 0.4791257679462433,
+ "learning_rate": 3.7596600582455976e-06,
+ "loss": 0.4187,
+ "step": 489
+ },
+ {
+ "epoch": 2.310344827586207,
+ "grad_norm": 0.4951220154762268,
+ "learning_rate": 3.75386702797396e-06,
+ "loss": 0.4205,
+ "step": 490
+ },
+ {
+ "epoch": 2.3150470219435735,
+ "grad_norm": 0.4765990674495697,
+ "learning_rate": 3.7480649883049164e-06,
+ "loss": 0.4251,
+ "step": 491
+ },
+ {
+ "epoch": 2.3197492163009406,
+ "grad_norm": 0.5125405192375183,
+ "learning_rate": 3.7422539809277993e-06,
+ "loss": 0.4361,
+ "step": 492
+ },
+ {
+ "epoch": 2.324451410658307,
+ "grad_norm": 0.5286112427711487,
+ "learning_rate": 3.736434047596379e-06,
+ "loss": 0.4423,
+ "step": 493
+ },
+ {
+ "epoch": 2.329153605015674,
+ "grad_norm": 0.47961002588272095,
+ "learning_rate": 3.73060523012856e-06,
+ "loss": 0.453,
+ "step": 494
+ },
+ {
+ "epoch": 2.333855799373041,
+ "grad_norm": 0.5857998728752136,
+ "learning_rate": 3.724767570406082e-06,
+ "loss": 0.4674,
+ "step": 495
+ },
+ {
+ "epoch": 2.3385579937304075,
+ "grad_norm": 0.5348326563835144,
+ "learning_rate": 3.7189211103742206e-06,
+ "loss": 0.4267,
+ "step": 496
+ },
+ {
+ "epoch": 2.343260188087774,
+ "grad_norm": 0.4718475937843323,
+ "learning_rate": 3.7130658920414818e-06,
+ "loss": 0.4619,
+ "step": 497
+ },
+ {
+ "epoch": 2.347962382445141,
+ "grad_norm": 0.44225215911865234,
+ "learning_rate": 3.7072019574793034e-06,
+ "loss": 0.4712,
+ "step": 498
+ },
+ {
+ "epoch": 2.352664576802508,
+ "grad_norm": 0.48492008447647095,
+ "learning_rate": 3.701329348821752e-06,
+ "loss": 0.4521,
+ "step": 499
+ },
+ {
+ "epoch": 2.3573667711598745,
+ "grad_norm": 0.49741214513778687,
+ "learning_rate": 3.695448108265221e-06,
+ "loss": 0.4378,
+ "step": 500
+ },
+ {
+ "epoch": 2.3620689655172415,
+ "grad_norm": 0.5086454749107361,
+ "learning_rate": 3.6895582780681254e-06,
+ "loss": 0.4349,
+ "step": 501
+ },
+ {
+ "epoch": 2.366771159874608,
+ "grad_norm": 0.49111631512641907,
+ "learning_rate": 3.683659900550598e-06,
+ "loss": 0.4625,
+ "step": 502
+ },
+ {
+ "epoch": 2.371473354231975,
+ "grad_norm": 0.5006322264671326,
+ "learning_rate": 3.6777530180941894e-06,
+ "loss": 0.4457,
+ "step": 503
+ },
+ {
+ "epoch": 2.376175548589342,
+ "grad_norm": 0.5934097170829773,
+ "learning_rate": 3.671837673141559e-06,
+ "loss": 0.4306,
+ "step": 504
+ },
+ {
+ "epoch": 2.3808777429467085,
+ "grad_norm": 0.626039981842041,
+ "learning_rate": 3.6659139081961707e-06,
+ "loss": 0.4464,
+ "step": 505
+ },
+ {
+ "epoch": 2.385579937304075,
+ "grad_norm": 0.4751131236553192,
+ "learning_rate": 3.6599817658219916e-06,
+ "loss": 0.4508,
+ "step": 506
+ },
+ {
+ "epoch": 2.3902821316614418,
+ "grad_norm": 1.4542276859283447,
+ "learning_rate": 3.6540412886431796e-06,
+ "loss": 0.4606,
+ "step": 507
+ },
+ {
+ "epoch": 2.394984326018809,
+ "grad_norm": 0.5189768075942993,
+ "learning_rate": 3.648092519343783e-06,
+ "loss": 0.4435,
+ "step": 508
+ },
+ {
+ "epoch": 2.3996865203761755,
+ "grad_norm": 1.4583938121795654,
+ "learning_rate": 3.642135500667431e-06,
+ "loss": 0.4314,
+ "step": 509
+ },
+ {
+ "epoch": 2.4043887147335425,
+ "grad_norm": 0.5038107633590698,
+ "learning_rate": 3.6361702754170247e-06,
+ "loss": 0.4463,
+ "step": 510
+ },
+ {
+ "epoch": 2.409090909090909,
+ "grad_norm": 0.5786447525024414,
+ "learning_rate": 3.630196886454435e-06,
+ "loss": 0.4281,
+ "step": 511
+ },
+ {
+ "epoch": 2.413793103448276,
+ "grad_norm": 0.48684218525886536,
+ "learning_rate": 3.62421537670019e-06,
+ "loss": 0.4432,
+ "step": 512
+ },
+ {
+ "epoch": 2.4184952978056424,
+ "grad_norm": 0.5117013454437256,
+ "learning_rate": 3.618225789133167e-06,
+ "loss": 0.4464,
+ "step": 513
+ },
+ {
+ "epoch": 2.4231974921630095,
+ "grad_norm": 0.49249181151390076,
+ "learning_rate": 3.612228166790287e-06,
+ "loss": 0.4465,
+ "step": 514
+ },
+ {
+ "epoch": 2.427899686520376,
+ "grad_norm": 0.5761134624481201,
+ "learning_rate": 3.606222552766201e-06,
+ "loss": 0.4539,
+ "step": 515
+ },
+ {
+ "epoch": 2.4326018808777428,
+ "grad_norm": 0.4839339256286621,
+ "learning_rate": 3.6002089902129844e-06,
+ "loss": 0.4469,
+ "step": 516
+ },
+ {
+ "epoch": 2.43730407523511,
+ "grad_norm": 0.4765976369380951,
+ "learning_rate": 3.5941875223398225e-06,
+ "loss": 0.4379,
+ "step": 517
+ },
+ {
+ "epoch": 2.4420062695924765,
+ "grad_norm": 0.5239338874816895,
+ "learning_rate": 3.588158192412707e-06,
+ "loss": 0.4354,
+ "step": 518
+ },
+ {
+ "epoch": 2.446708463949843,
+ "grad_norm": 0.48244595527648926,
+ "learning_rate": 3.582121043754116e-06,
+ "loss": 0.438,
+ "step": 519
+ },
+ {
+ "epoch": 2.45141065830721,
+ "grad_norm": 0.4641244411468506,
+ "learning_rate": 3.5760761197427097e-06,
+ "loss": 0.438,
+ "step": 520
+ },
+ {
+ "epoch": 2.456112852664577,
+ "grad_norm": 0.48468074202537537,
+ "learning_rate": 3.570023463813017e-06,
+ "loss": 0.4306,
+ "step": 521
+ },
+ {
+ "epoch": 2.4608150470219434,
+ "grad_norm": 0.48626402020454407,
+ "learning_rate": 3.5639631194551216e-06,
+ "loss": 0.4531,
+ "step": 522
+ },
+ {
+ "epoch": 2.4655172413793105,
+ "grad_norm": 0.5581764578819275,
+ "learning_rate": 3.557895130214352e-06,
+ "loss": 0.4451,
+ "step": 523
+ },
+ {
+ "epoch": 2.470219435736677,
+ "grad_norm": 0.6739279627799988,
+ "learning_rate": 3.5518195396909653e-06,
+ "loss": 0.4636,
+ "step": 524
+ },
+ {
+ "epoch": 2.4749216300940438,
+ "grad_norm": 0.550710916519165,
+ "learning_rate": 3.5457363915398384e-06,
+ "loss": 0.4513,
+ "step": 525
+ },
+ {
+ "epoch": 2.479623824451411,
+ "grad_norm": 0.479632705450058,
+ "learning_rate": 3.539645729470151e-06,
+ "loss": 0.4387,
+ "step": 526
+ },
+ {
+ "epoch": 2.4843260188087775,
+ "grad_norm": 0.48741331696510315,
+ "learning_rate": 3.5335475972450715e-06,
+ "loss": 0.4388,
+ "step": 527
+ },
+ {
+ "epoch": 2.489028213166144,
+ "grad_norm": 0.4964964985847473,
+ "learning_rate": 3.5274420386814458e-06,
+ "loss": 0.4643,
+ "step": 528
+ },
+ {
+ "epoch": 2.493730407523511,
+ "grad_norm": 0.5134934186935425,
+ "learning_rate": 3.521329097649478e-06,
+ "loss": 0.4454,
+ "step": 529
+ },
+ {
+ "epoch": 2.498432601880878,
+ "grad_norm": 0.4962058961391449,
+ "learning_rate": 3.515208818072418e-06,
+ "loss": 0.4408,
+ "step": 530
+ },
+ {
+ "epoch": 2.5031347962382444,
+ "grad_norm": 0.5611489415168762,
+ "learning_rate": 3.509081243926247e-06,
+ "loss": 0.4306,
+ "step": 531
+ },
+ {
+ "epoch": 2.507836990595611,
+ "grad_norm": 0.7012472748756409,
+ "learning_rate": 3.5029464192393557e-06,
+ "loss": 0.4614,
+ "step": 532
+ },
+ {
+ "epoch": 2.512539184952978,
+ "grad_norm": 0.5351004004478455,
+ "learning_rate": 3.4968043880922363e-06,
+ "loss": 0.4151,
+ "step": 533
+ },
+ {
+ "epoch": 2.5172413793103448,
+ "grad_norm": 0.5087808966636658,
+ "learning_rate": 3.4906551946171603e-06,
+ "loss": 0.4242,
+ "step": 534
+ },
+ {
+ "epoch": 2.521943573667712,
+ "grad_norm": 0.5459093451499939,
+ "learning_rate": 3.484498882997861e-06,
+ "loss": 0.4215,
+ "step": 535
+ },
+ {
+ "epoch": 2.5266457680250785,
+ "grad_norm": 0.49804285168647766,
+ "learning_rate": 3.478335497469219e-06,
+ "loss": 0.4492,
+ "step": 536
+ },
+ {
+ "epoch": 2.531347962382445,
+ "grad_norm": 0.4959704875946045,
+ "learning_rate": 3.472165082316943e-06,
+ "loss": 0.4511,
+ "step": 537
+ },
+ {
+ "epoch": 2.5360501567398117,
+ "grad_norm": 0.5059382319450378,
+ "learning_rate": 3.465987681877251e-06,
+ "loss": 0.4419,
+ "step": 538
+ },
+ {
+ "epoch": 2.540752351097179,
+ "grad_norm": 0.7398380637168884,
+ "learning_rate": 3.4598033405365527e-06,
+ "loss": 0.4548,
+ "step": 539
+ },
+ {
+ "epoch": 2.5454545454545454,
+ "grad_norm": 0.5326687693595886,
+ "learning_rate": 3.45361210273113e-06,
+ "loss": 0.4473,
+ "step": 540
+ },
+ {
+ "epoch": 2.5501567398119125,
+ "grad_norm": 0.5069761872291565,
+ "learning_rate": 3.447414012946818e-06,
+ "loss": 0.4343,
+ "step": 541
+ },
+ {
+ "epoch": 2.554858934169279,
+ "grad_norm": 0.45915964245796204,
+ "learning_rate": 3.4412091157186853e-06,
+ "loss": 0.4499,
+ "step": 542
+ },
+ {
+ "epoch": 2.5595611285266457,
+ "grad_norm": 0.5174360275268555,
+ "learning_rate": 3.4349974556307146e-06,
+ "loss": 0.44,
+ "step": 543
+ },
+ {
+ "epoch": 2.5642633228840124,
+ "grad_norm": 0.5008105039596558,
+ "learning_rate": 3.4287790773154807e-06,
+ "loss": 0.4648,
+ "step": 544
+ },
+ {
+ "epoch": 2.5689655172413794,
+ "grad_norm": 0.5628801584243774,
+ "learning_rate": 3.4225540254538297e-06,
+ "loss": 0.462,
+ "step": 545
+ },
+ {
+ "epoch": 2.573667711598746,
+ "grad_norm": 0.9913654923439026,
+ "learning_rate": 3.416322344774562e-06,
+ "loss": 0.4403,
+ "step": 546
+ },
+ {
+ "epoch": 2.5783699059561127,
+ "grad_norm": 0.5034172534942627,
+ "learning_rate": 3.4100840800541055e-06,
+ "loss": 0.4622,
+ "step": 547
+ },
+ {
+ "epoch": 2.58307210031348,
+ "grad_norm": 0.495516836643219,
+ "learning_rate": 3.4038392761161986e-06,
+ "loss": 0.4523,
+ "step": 548
+ },
+ {
+ "epoch": 2.5877742946708464,
+ "grad_norm": 0.48142367601394653,
+ "learning_rate": 3.3975879778315634e-06,
+ "loss": 0.4242,
+ "step": 549
+ },
+ {
+ "epoch": 2.592476489028213,
+ "grad_norm": 0.4635900557041168,
+ "learning_rate": 3.391330230117587e-06,
+ "loss": 0.3949,
+ "step": 550
+ },
+ {
+ "epoch": 2.5971786833855797,
+ "grad_norm": 0.4769044816493988,
+ "learning_rate": 3.385066077937997e-06,
+ "loss": 0.4651,
+ "step": 551
+ },
+ {
+ "epoch": 2.6018808777429467,
+ "grad_norm": 1.059553861618042,
+ "learning_rate": 3.378795566302541e-06,
+ "loss": 0.4243,
+ "step": 552
+ },
+ {
+ "epoch": 2.6065830721003134,
+ "grad_norm": 0.512134850025177,
+ "learning_rate": 3.372518740266658e-06,
+ "loss": 0.4435,
+ "step": 553
+ },
+ {
+ "epoch": 2.6112852664576804,
+ "grad_norm": 0.5267173647880554,
+ "learning_rate": 3.36623564493116e-06,
+ "loss": 0.4558,
+ "step": 554
+ },
+ {
+ "epoch": 2.615987460815047,
+ "grad_norm": 0.49343907833099365,
+ "learning_rate": 3.3599463254419047e-06,
+ "loss": 0.4598,
+ "step": 555
+ },
+ {
+ "epoch": 2.6206896551724137,
+ "grad_norm": 0.5496839284896851,
+ "learning_rate": 3.3536508269894724e-06,
+ "loss": 0.4669,
+ "step": 556
+ },
+ {
+ "epoch": 2.6253918495297803,
+ "grad_norm": 0.5957831740379333,
+ "learning_rate": 3.347349194808842e-06,
+ "loss": 0.4533,
+ "step": 557
+ },
+ {
+ "epoch": 2.6300940438871474,
+ "grad_norm": 0.5049230456352234,
+ "learning_rate": 3.3410414741790625e-06,
+ "loss": 0.4293,
+ "step": 558
+ },
+ {
+ "epoch": 2.634796238244514,
+ "grad_norm": 0.5167728066444397,
+ "learning_rate": 3.3347277104229332e-06,
+ "loss": 0.443,
+ "step": 559
+ },
+ {
+ "epoch": 2.639498432601881,
+ "grad_norm": 0.6090758442878723,
+ "learning_rate": 3.3284079489066728e-06,
+ "loss": 0.4378,
+ "step": 560
+ },
+ {
+ "epoch": 2.6442006269592477,
+ "grad_norm": 0.5165027379989624,
+ "learning_rate": 3.3220822350395966e-06,
+ "loss": 0.4302,
+ "step": 561
+ },
+ {
+ "epoch": 2.6489028213166144,
+ "grad_norm": 0.5152680277824402,
+ "learning_rate": 3.31575061427379e-06,
+ "loss": 0.4311,
+ "step": 562
+ },
+ {
+ "epoch": 2.653605015673981,
+ "grad_norm": 0.547235906124115,
+ "learning_rate": 3.3094131321037783e-06,
+ "loss": 0.4371,
+ "step": 563
+ },
+ {
+ "epoch": 2.658307210031348,
+ "grad_norm": 0.521981418132782,
+ "learning_rate": 3.303069834066206e-06,
+ "loss": 0.4346,
+ "step": 564
+ },
+ {
+ "epoch": 2.6630094043887147,
+ "grad_norm": 0.5127217769622803,
+ "learning_rate": 3.2967207657395055e-06,
+ "loss": 0.474,
+ "step": 565
+ },
+ {
+ "epoch": 2.6677115987460818,
+ "grad_norm": 0.5210872888565063,
+ "learning_rate": 3.2903659727435692e-06,
+ "loss": 0.4622,
+ "step": 566
+ },
+ {
+ "epoch": 2.6724137931034484,
+ "grad_norm": 0.5768873691558838,
+ "learning_rate": 3.284005500739423e-06,
+ "loss": 0.4556,
+ "step": 567
+ },
+ {
+ "epoch": 2.677115987460815,
+ "grad_norm": 0.5305764675140381,
+ "learning_rate": 3.2776393954289e-06,
+ "loss": 0.429,
+ "step": 568
+ },
+ {
+ "epoch": 2.6818181818181817,
+ "grad_norm": 0.5312129855155945,
+ "learning_rate": 3.271267702554307e-06,
+ "loss": 0.4208,
+ "step": 569
+ },
+ {
+ "epoch": 2.6865203761755487,
+ "grad_norm": 0.5433884859085083,
+ "learning_rate": 3.2648904678981032e-06,
+ "loss": 0.4647,
+ "step": 570
+ },
+ {
+ "epoch": 2.6912225705329154,
+ "grad_norm": 1.2331725358963013,
+ "learning_rate": 3.2585077372825636e-06,
+ "loss": 0.4126,
+ "step": 571
+ },
+ {
+ "epoch": 2.695924764890282,
+ "grad_norm": 0.5495198369026184,
+ "learning_rate": 3.2521195565694543e-06,
+ "loss": 0.4453,
+ "step": 572
+ },
+ {
+ "epoch": 2.700626959247649,
+ "grad_norm": 0.5230907201766968,
+ "learning_rate": 3.2457259716597023e-06,
+ "loss": 0.446,
+ "step": 573
+ },
+ {
+ "epoch": 2.7053291536050157,
+ "grad_norm": 0.4807503819465637,
+ "learning_rate": 3.2393270284930658e-06,
+ "loss": 0.4547,
+ "step": 574
+ },
+ {
+ "epoch": 2.7100313479623823,
+ "grad_norm": 0.5169614553451538,
+ "learning_rate": 3.2329227730478026e-06,
+ "loss": 0.4319,
+ "step": 575
+ },
+ {
+ "epoch": 2.714733542319749,
+ "grad_norm": 0.502966046333313,
+ "learning_rate": 3.2265132513403415e-06,
+ "loss": 0.4196,
+ "step": 576
+ },
+ {
+ "epoch": 2.719435736677116,
+ "grad_norm": 0.5387672781944275,
+ "learning_rate": 3.22009850942495e-06,
+ "loss": 0.4449,
+ "step": 577
+ },
+ {
+ "epoch": 2.7241379310344827,
+ "grad_norm": 0.5503709316253662,
+ "learning_rate": 3.213678593393405e-06,
+ "loss": 0.4589,
+ "step": 578
+ },
+ {
+ "epoch": 2.7288401253918497,
+ "grad_norm": 0.5165039300918579,
+ "learning_rate": 3.207253549374662e-06,
+ "loss": 0.4578,
+ "step": 579
+ },
+ {
+ "epoch": 2.7335423197492164,
+ "grad_norm": 0.5894023180007935,
+ "learning_rate": 3.200823423534519e-06,
+ "loss": 0.4448,
+ "step": 580
+ },
+ {
+ "epoch": 2.738244514106583,
+ "grad_norm": 0.5234156250953674,
+ "learning_rate": 3.194388262075293e-06,
+ "loss": 0.4504,
+ "step": 581
+ },
+ {
+ "epoch": 2.7429467084639496,
+ "grad_norm": 0.47498077154159546,
+ "learning_rate": 3.1879481112354804e-06,
+ "loss": 0.4471,
+ "step": 582
+ },
+ {
+ "epoch": 2.7476489028213167,
+ "grad_norm": 0.5213322043418884,
+ "learning_rate": 3.181503017289428e-06,
+ "loss": 0.4096,
+ "step": 583
+ },
+ {
+ "epoch": 2.7523510971786833,
+ "grad_norm": 0.5031464695930481,
+ "learning_rate": 3.175053026547002e-06,
+ "loss": 0.416,
+ "step": 584
+ },
+ {
+ "epoch": 2.7570532915360504,
+ "grad_norm": 0.7983574867248535,
+ "learning_rate": 3.16859818535325e-06,
+ "loss": 0.457,
+ "step": 585
+ },
+ {
+ "epoch": 2.761755485893417,
+ "grad_norm": 0.47774994373321533,
+ "learning_rate": 3.1621385400880756e-06,
+ "loss": 0.4529,
+ "step": 586
+ },
+ {
+ "epoch": 2.7664576802507836,
+ "grad_norm": 0.8216882348060608,
+ "learning_rate": 3.1556741371658984e-06,
+ "loss": 0.4559,
+ "step": 587
+ },
+ {
+ "epoch": 2.7711598746081503,
+ "grad_norm": 0.5124049186706543,
+ "learning_rate": 3.1492050230353238e-06,
+ "loss": 0.4438,
+ "step": 588
+ },
+ {
+ "epoch": 2.7758620689655173,
+ "grad_norm": 0.5410915017127991,
+ "learning_rate": 3.142731244178809e-06,
+ "loss": 0.4195,
+ "step": 589
+ },
+ {
+ "epoch": 2.780564263322884,
+ "grad_norm": 0.5318175554275513,
+ "learning_rate": 3.1362528471123277e-06,
+ "loss": 0.4046,
+ "step": 590
+ },
+ {
+ "epoch": 2.785266457680251,
+ "grad_norm": 0.6133676171302795,
+ "learning_rate": 3.129769878385039e-06,
+ "loss": 0.4098,
+ "step": 591
+ },
+ {
+ "epoch": 2.7899686520376177,
+ "grad_norm": 0.4698888063430786,
+ "learning_rate": 3.1232823845789473e-06,
+ "loss": 0.4508,
+ "step": 592
+ },
+ {
+ "epoch": 2.7946708463949843,
+ "grad_norm": 0.6980767250061035,
+ "learning_rate": 3.1167904123085736e-06,
+ "loss": 0.455,
+ "step": 593
+ },
+ {
+ "epoch": 2.799373040752351,
+ "grad_norm": 0.5151284337043762,
+ "learning_rate": 3.110294008220617e-06,
+ "loss": 0.4431,
+ "step": 594
+ },
+ {
+ "epoch": 2.804075235109718,
+ "grad_norm": 0.47901320457458496,
+ "learning_rate": 3.1037932189936205e-06,
+ "loss": 0.4406,
+ "step": 595
+ },
+ {
+ "epoch": 2.8087774294670846,
+ "grad_norm": 0.5079891085624695,
+ "learning_rate": 3.097288091337635e-06,
+ "loss": 0.4351,
+ "step": 596
+ },
+ {
+ "epoch": 2.8134796238244513,
+ "grad_norm": 0.5278874635696411,
+ "learning_rate": 3.0907786719938876e-06,
+ "loss": 0.4264,
+ "step": 597
+ },
+ {
+ "epoch": 2.8181818181818183,
+ "grad_norm": 0.47123396396636963,
+ "learning_rate": 3.084265007734436e-06,
+ "loss": 0.434,
+ "step": 598
+ },
+ {
+ "epoch": 2.822884012539185,
+ "grad_norm": 0.5229635834693909,
+ "learning_rate": 3.0777471453618457e-06,
+ "loss": 0.4602,
+ "step": 599
+ },
+ {
+ "epoch": 2.8275862068965516,
+ "grad_norm": 0.47847074270248413,
+ "learning_rate": 3.0712251317088426e-06,
+ "loss": 0.4317,
+ "step": 600
+ },
+ {
+ "epoch": 2.8322884012539182,
+ "grad_norm": 0.7754543423652649,
+ "learning_rate": 3.064699013637983e-06,
+ "loss": 0.4528,
+ "step": 601
+ },
+ {
+ "epoch": 2.8369905956112853,
+ "grad_norm": 0.5581084489822388,
+ "learning_rate": 3.0581688380413115e-06,
+ "loss": 0.4369,
+ "step": 602
+ },
+ {
+ "epoch": 2.841692789968652,
+ "grad_norm": 0.588622510433197,
+ "learning_rate": 3.0516346518400315e-06,
+ "loss": 0.4517,
+ "step": 603
+ },
+ {
+ "epoch": 2.846394984326019,
+ "grad_norm": 0.565423846244812,
+ "learning_rate": 3.0450965019841593e-06,
+ "loss": 0.4517,
+ "step": 604
+ },
+ {
+ "epoch": 2.8510971786833856,
+ "grad_norm": 0.47801777720451355,
+ "learning_rate": 3.0385544354521957e-06,
+ "loss": 0.4161,
+ "step": 605
+ },
+ {
+ "epoch": 2.8557993730407523,
+ "grad_norm": 0.5034862756729126,
+ "learning_rate": 3.0320084992507814e-06,
+ "loss": 0.4428,
+ "step": 606
+ },
+ {
+ "epoch": 2.860501567398119,
+ "grad_norm": 0.5339663624763489,
+ "learning_rate": 3.0254587404143604e-06,
+ "loss": 0.4792,
+ "step": 607
+ },
+ {
+ "epoch": 2.865203761755486,
+ "grad_norm": 0.48184943199157715,
+ "learning_rate": 3.0189052060048464e-06,
+ "loss": 0.4409,
+ "step": 608
+ },
+ {
+ "epoch": 2.8699059561128526,
+ "grad_norm": 0.5102176070213318,
+ "learning_rate": 3.01234794311128e-06,
+ "loss": 0.438,
+ "step": 609
+ },
+ {
+ "epoch": 2.8746081504702197,
+ "grad_norm": 0.5111781358718872,
+ "learning_rate": 3.0057869988494925e-06,
+ "loss": 0.4617,
+ "step": 610
+ },
+ {
+ "epoch": 2.8793103448275863,
+ "grad_norm": 0.5915101766586304,
+ "learning_rate": 2.999222420361767e-06,
+ "loss": 0.4532,
+ "step": 611
+ },
+ {
+ "epoch": 2.884012539184953,
+ "grad_norm": 0.48898932337760925,
+ "learning_rate": 2.9926542548165e-06,
+ "loss": 0.4663,
+ "step": 612
+ },
+ {
+ "epoch": 2.8887147335423196,
+ "grad_norm": 0.4943861961364746,
+ "learning_rate": 2.9860825494078605e-06,
+ "loss": 0.4354,
+ "step": 613
+ },
+ {
+ "epoch": 2.8934169278996866,
+ "grad_norm": 0.5398025512695312,
+ "learning_rate": 2.979507351355454e-06,
+ "loss": 0.4546,
+ "step": 614
+ },
+ {
+ "epoch": 2.8981191222570533,
+ "grad_norm": 0.545421302318573,
+ "learning_rate": 2.972928707903981e-06,
+ "loss": 0.4404,
+ "step": 615
+ },
+ {
+ "epoch": 2.9028213166144203,
+ "grad_norm": 0.5370550751686096,
+ "learning_rate": 2.966346666322898e-06,
+ "loss": 0.4401,
+ "step": 616
+ },
+ {
+ "epoch": 2.907523510971787,
+ "grad_norm": 0.5280672311782837,
+ "learning_rate": 2.9597612739060775e-06,
+ "loss": 0.4172,
+ "step": 617
+ },
+ {
+ "epoch": 2.9122257053291536,
+ "grad_norm": 0.5043423175811768,
+ "learning_rate": 2.9531725779714713e-06,
+ "loss": 0.4487,
+ "step": 618
+ },
+ {
+ "epoch": 2.91692789968652,
+ "grad_norm": 1.961200475692749,
+ "learning_rate": 2.9465806258607653e-06,
+ "loss": 0.4548,
+ "step": 619
+ },
+ {
+ "epoch": 2.9216300940438873,
+ "grad_norm": 0.5286726355552673,
+ "learning_rate": 2.939985464939043e-06,
+ "loss": 0.4566,
+ "step": 620
+ },
+ {
+ "epoch": 2.926332288401254,
+ "grad_norm": 0.5209453105926514,
+ "learning_rate": 2.9333871425944434e-06,
+ "loss": 0.4064,
+ "step": 621
+ },
+ {
+ "epoch": 2.9310344827586206,
+ "grad_norm": 0.47711747884750366,
+ "learning_rate": 2.926785706237822e-06,
+ "loss": 0.4341,
+ "step": 622
+ },
+ {
+ "epoch": 2.9357366771159876,
+ "grad_norm": 0.45926427841186523,
+ "learning_rate": 2.920181203302409e-06,
+ "loss": 0.4256,
+ "step": 623
+ },
+ {
+ "epoch": 2.9404388714733543,
+ "grad_norm": 0.5624600648880005,
+ "learning_rate": 2.91357368124347e-06,
+ "loss": 0.4252,
+ "step": 624
+ },
+ {
+ "epoch": 2.945141065830721,
+ "grad_norm": 0.5101850628852844,
+ "learning_rate": 2.906963187537962e-06,
+ "loss": 0.4352,
+ "step": 625
+ },
+ {
+ "epoch": 2.9498432601880875,
+ "grad_norm": 0.5341358184814453,
+ "learning_rate": 2.9003497696841955e-06,
+ "loss": 0.4132,
+ "step": 626
+ },
+ {
+ "epoch": 2.9545454545454546,
+ "grad_norm": 0.5917084217071533,
+ "learning_rate": 2.8937334752014913e-06,
+ "loss": 0.4693,
+ "step": 627
+ },
+ {
+ "epoch": 2.959247648902821,
+ "grad_norm": 0.793695330619812,
+ "learning_rate": 2.887114351629839e-06,
+ "loss": 0.4431,
+ "step": 628
+ },
+ {
+ "epoch": 2.9639498432601883,
+ "grad_norm": 0.5363728404045105,
+ "learning_rate": 2.8804924465295575e-06,
+ "loss": 0.4672,
+ "step": 629
+ },
+ {
+ "epoch": 2.968652037617555,
+ "grad_norm": 0.4979572892189026,
+ "learning_rate": 2.873867807480951e-06,
+ "loss": 0.4723,
+ "step": 630
+ },
+ {
+ "epoch": 2.9733542319749215,
+ "grad_norm": 0.5310130715370178,
+ "learning_rate": 2.8672404820839676e-06,
+ "loss": 0.4388,
+ "step": 631
+ },
+ {
+ "epoch": 2.978056426332288,
+ "grad_norm": 0.530015766620636,
+ "learning_rate": 2.8606105179578584e-06,
+ "loss": 0.4466,
+ "step": 632
+ },
+ {
+ "epoch": 2.9827586206896552,
+ "grad_norm": 0.5356627702713013,
+ "learning_rate": 2.8539779627408332e-06,
+ "loss": 0.4252,
+ "step": 633
+ },
+ {
+ "epoch": 2.987460815047022,
+ "grad_norm": 0.5290245413780212,
+ "learning_rate": 2.847342864089721e-06,
+ "loss": 0.4453,
+ "step": 634
+ },
+ {
+ "epoch": 2.992163009404389,
+ "grad_norm": 0.471682071685791,
+ "learning_rate": 2.8407052696796255e-06,
+ "loss": 0.43,
+ "step": 635
+ },
+ {
+ "epoch": 2.9968652037617556,
+ "grad_norm": 0.5220829844474792,
+ "learning_rate": 2.834065227203584e-06,
+ "loss": 0.4494,
+ "step": 636
+ },
+ {
+ "epoch": 3.0047021943573666,
+ "grad_norm": 0.4797399342060089,
+ "learning_rate": 2.8274227843722213e-06,
+ "loss": 0.8683,
+ "step": 637
+ },
+ {
+ "epoch": 3.0094043887147337,
+ "grad_norm": 0.5463248491287231,
+ "learning_rate": 2.820777988913412e-06,
+ "loss": 0.4157,
+ "step": 638
+ },
+ {
+ "epoch": 3.0141065830721003,
+ "grad_norm": 0.5081924200057983,
+ "learning_rate": 2.8141308885719337e-06,
+ "loss": 0.4169,
+ "step": 639
+ },
+ {
+ "epoch": 3.018808777429467,
+ "grad_norm": 0.4916677474975586,
+ "learning_rate": 2.8074815311091265e-06,
+ "loss": 0.3898,
+ "step": 640
+ },
+ {
+ "epoch": 3.023510971786834,
+ "grad_norm": 0.48858827352523804,
+ "learning_rate": 2.8008299643025477e-06,
+ "loss": 0.4319,
+ "step": 641
+ },
+ {
+ "epoch": 3.0282131661442007,
+ "grad_norm": 0.49183058738708496,
+ "learning_rate": 2.7941762359456294e-06,
+ "loss": 0.4243,
+ "step": 642
+ },
+ {
+ "epoch": 3.0329153605015673,
+ "grad_norm": 0.5068245530128479,
+ "learning_rate": 2.787520393847334e-06,
+ "loss": 0.4168,
+ "step": 643
+ },
+ {
+ "epoch": 3.0376175548589344,
+ "grad_norm": 0.542245090007782,
+ "learning_rate": 2.780862485831814e-06,
+ "loss": 0.4289,
+ "step": 644
+ },
+ {
+ "epoch": 3.042319749216301,
+ "grad_norm": 0.49114999175071716,
+ "learning_rate": 2.7742025597380644e-06,
+ "loss": 0.4337,
+ "step": 645
+ },
+ {
+ "epoch": 3.0470219435736676,
+ "grad_norm": 0.4982999563217163,
+ "learning_rate": 2.7675406634195824e-06,
+ "loss": 0.4207,
+ "step": 646
+ },
+ {
+ "epoch": 3.0517241379310347,
+ "grad_norm": 0.5352709293365479,
+ "learning_rate": 2.7608768447440193e-06,
+ "loss": 0.4087,
+ "step": 647
+ },
+ {
+ "epoch": 3.0564263322884013,
+ "grad_norm": 0.5486279726028442,
+ "learning_rate": 2.754211151592841e-06,
+ "loss": 0.4129,
+ "step": 648
+ },
+ {
+ "epoch": 3.061128526645768,
+ "grad_norm": 0.6048034429550171,
+ "learning_rate": 2.7475436318609827e-06,
+ "loss": 0.433,
+ "step": 649
+ },
+ {
+ "epoch": 3.0658307210031346,
+ "grad_norm": 0.6576470136642456,
+ "learning_rate": 2.7408743334565006e-06,
+ "loss": 0.4086,
+ "step": 650
+ },
+ {
+ "epoch": 3.0705329153605017,
+ "grad_norm": 0.49989938735961914,
+ "learning_rate": 2.734203304300235e-06,
+ "loss": 0.3999,
+ "step": 651
+ },
+ {
+ "epoch": 3.0752351097178683,
+ "grad_norm": 0.5238141417503357,
+ "learning_rate": 2.7275305923254607e-06,
+ "loss": 0.4133,
+ "step": 652
+ },
+ {
+ "epoch": 3.079937304075235,
+ "grad_norm": 0.5244804620742798,
+ "learning_rate": 2.720856245477544e-06,
+ "loss": 0.4016,
+ "step": 653
+ },
+ {
+ "epoch": 3.084639498432602,
+ "grad_norm": 0.5036159753799438,
+ "learning_rate": 2.7141803117135978e-06,
+ "loss": 0.3972,
+ "step": 654
+ },
+ {
+ "epoch": 3.0893416927899686,
+ "grad_norm": 0.5390443801879883,
+ "learning_rate": 2.7075028390021385e-06,
+ "loss": 0.3992,
+ "step": 655
+ },
+ {
+ "epoch": 3.0940438871473352,
+ "grad_norm": 0.5226757526397705,
+ "learning_rate": 2.7008238753227385e-06,
+ "loss": 0.4074,
+ "step": 656
+ },
+ {
+ "epoch": 3.0987460815047023,
+ "grad_norm": 0.48386913537979126,
+ "learning_rate": 2.694143468665685e-06,
+ "loss": 0.4284,
+ "step": 657
+ },
+ {
+ "epoch": 3.103448275862069,
+ "grad_norm": 0.5081993341445923,
+ "learning_rate": 2.6874616670316338e-06,
+ "loss": 0.3952,
+ "step": 658
+ },
+ {
+ "epoch": 3.1081504702194356,
+ "grad_norm": 0.538280189037323,
+ "learning_rate": 2.6807785184312618e-06,
+ "loss": 0.4136,
+ "step": 659
+ },
+ {
+ "epoch": 3.1128526645768027,
+ "grad_norm": 0.7804566621780396,
+ "learning_rate": 2.674094070884926e-06,
+ "loss": 0.4131,
+ "step": 660
+ },
+ {
+ "epoch": 3.1175548589341693,
+ "grad_norm": 0.6693199872970581,
+ "learning_rate": 2.6674083724223166e-06,
+ "loss": 0.4329,
+ "step": 661
+ },
+ {
+ "epoch": 3.122257053291536,
+ "grad_norm": 0.5034769773483276,
+ "learning_rate": 2.6607214710821112e-06,
+ "loss": 0.4062,
+ "step": 662
+ },
+ {
+ "epoch": 3.126959247648903,
+ "grad_norm": 0.5518231391906738,
+ "learning_rate": 2.6540334149116304e-06,
+ "loss": 0.4172,
+ "step": 663
+ },
+ {
+ "epoch": 3.1316614420062696,
+ "grad_norm": 0.5797336101531982,
+ "learning_rate": 2.647344251966493e-06,
+ "loss": 0.4164,
+ "step": 664
+ },
+ {
+ "epoch": 3.1363636363636362,
+ "grad_norm": 0.5404736399650574,
+ "learning_rate": 2.6406540303102714e-06,
+ "loss": 0.4157,
+ "step": 665
+ },
+ {
+ "epoch": 3.1410658307210033,
+ "grad_norm": 0.5246729850769043,
+ "learning_rate": 2.6339627980141425e-06,
+ "loss": 0.4165,
+ "step": 666
+ },
+ {
+ "epoch": 3.14576802507837,
+ "grad_norm": 0.5443553328514099,
+ "learning_rate": 2.6272706031565482e-06,
+ "loss": 0.4022,
+ "step": 667
+ },
+ {
+ "epoch": 3.1504702194357366,
+ "grad_norm": 0.5127459168434143,
+ "learning_rate": 2.6205774938228433e-06,
+ "loss": 0.3983,
+ "step": 668
+ },
+ {
+ "epoch": 3.1551724137931036,
+ "grad_norm": 0.5095480680465698,
+ "learning_rate": 2.6138835181049556e-06,
+ "loss": 0.4227,
+ "step": 669
+ },
+ {
+ "epoch": 3.1598746081504703,
+ "grad_norm": 0.5238015651702881,
+ "learning_rate": 2.6071887241010374e-06,
+ "loss": 0.4056,
+ "step": 670
+ },
+ {
+ "epoch": 3.164576802507837,
+ "grad_norm": 0.5659390687942505,
+ "learning_rate": 2.6004931599151223e-06,
+ "loss": 0.3933,
+ "step": 671
+ },
+ {
+ "epoch": 3.169278996865204,
+ "grad_norm": 0.528191328048706,
+ "learning_rate": 2.593796873656775e-06,
+ "loss": 0.4356,
+ "step": 672
+ },
+ {
+ "epoch": 3.1739811912225706,
+ "grad_norm": 1.1774086952209473,
+ "learning_rate": 2.587099913440749e-06,
+ "loss": 0.4149,
+ "step": 673
+ },
+ {
+ "epoch": 3.1786833855799372,
+ "grad_norm": 0.5629571676254272,
+ "learning_rate": 2.580402327386643e-06,
+ "loss": 0.403,
+ "step": 674
+ },
+ {
+ "epoch": 3.183385579937304,
+ "grad_norm": 1.1260513067245483,
+ "learning_rate": 2.5737041636185496e-06,
+ "loss": 0.4102,
+ "step": 675
+ },
+ {
+ "epoch": 3.188087774294671,
+ "grad_norm": 0.6467511653900146,
+ "learning_rate": 2.5670054702647146e-06,
+ "loss": 0.3948,
+ "step": 676
+ },
+ {
+ "epoch": 3.1927899686520376,
+ "grad_norm": 0.5177720785140991,
+ "learning_rate": 2.5603062954571872e-06,
+ "loss": 0.4188,
+ "step": 677
+ },
+ {
+ "epoch": 3.197492163009404,
+ "grad_norm": 0.5086417198181152,
+ "learning_rate": 2.553606687331477e-06,
+ "loss": 0.4403,
+ "step": 678
+ },
+ {
+ "epoch": 3.2021943573667713,
+ "grad_norm": 0.5762012600898743,
+ "learning_rate": 2.5469066940262073e-06,
+ "loss": 0.4084,
+ "step": 679
+ },
+ {
+ "epoch": 3.206896551724138,
+ "grad_norm": 0.5122736692428589,
+ "learning_rate": 2.540206363682768e-06,
+ "loss": 0.4005,
+ "step": 680
+ },
+ {
+ "epoch": 3.2115987460815045,
+ "grad_norm": 0.5179394483566284,
+ "learning_rate": 2.533505744444972e-06,
+ "loss": 0.419,
+ "step": 681
+ },
+ {
+ "epoch": 3.2163009404388716,
+ "grad_norm": 0.5541443824768066,
+ "learning_rate": 2.526804884458707e-06,
+ "loss": 0.4112,
+ "step": 682
+ },
+ {
+ "epoch": 3.2210031347962382,
+ "grad_norm": 0.5687317252159119,
+ "learning_rate": 2.520103831871591e-06,
+ "loss": 0.4145,
+ "step": 683
+ },
+ {
+ "epoch": 3.225705329153605,
+ "grad_norm": 0.5060294270515442,
+ "learning_rate": 2.513402634832627e-06,
+ "loss": 0.3933,
+ "step": 684
+ },
+ {
+ "epoch": 3.230407523510972,
+ "grad_norm": 0.6311008930206299,
+ "learning_rate": 2.5067013414918523e-06,
+ "loss": 0.401,
+ "step": 685
+ },
+ {
+ "epoch": 3.2351097178683386,
+ "grad_norm": 0.5575832724571228,
+ "learning_rate": 2.5e-06,
+ "loss": 0.4127,
+ "step": 686
+ },
+ {
+ "epoch": 3.239811912225705,
+ "grad_norm": 0.5105507373809814,
+ "learning_rate": 2.493298658508149e-06,
+ "loss": 0.3971,
+ "step": 687
+ },
+ {
+ "epoch": 3.2445141065830723,
+ "grad_norm": 0.5813129544258118,
+ "learning_rate": 2.4865973651673743e-06,
+ "loss": 0.4136,
+ "step": 688
+ },
+ {
+ "epoch": 3.249216300940439,
+ "grad_norm": 0.5921242833137512,
+ "learning_rate": 2.4798961681284096e-06,
+ "loss": 0.437,
+ "step": 689
+ },
+ {
+ "epoch": 3.2539184952978055,
+ "grad_norm": 0.5654864311218262,
+ "learning_rate": 2.473195115541293e-06,
+ "loss": 0.3939,
+ "step": 690
+ },
+ {
+ "epoch": 3.2586206896551726,
+ "grad_norm": 0.5103882551193237,
+ "learning_rate": 2.466494255555029e-06,
+ "loss": 0.4394,
+ "step": 691
+ },
+ {
+ "epoch": 3.2633228840125392,
+ "grad_norm": 0.5423967242240906,
+ "learning_rate": 2.459793636317233e-06,
+ "loss": 0.4048,
+ "step": 692
+ },
+ {
+ "epoch": 3.268025078369906,
+ "grad_norm": 0.6185951828956604,
+ "learning_rate": 2.4530933059737936e-06,
+ "loss": 0.4432,
+ "step": 693
+ },
+ {
+ "epoch": 3.2727272727272725,
+ "grad_norm": 0.6062753796577454,
+ "learning_rate": 2.4463933126685236e-06,
+ "loss": 0.4061,
+ "step": 694
+ },
+ {
+ "epoch": 3.2774294670846396,
+ "grad_norm": 0.5118281841278076,
+ "learning_rate": 2.439693704542814e-06,
+ "loss": 0.4008,
+ "step": 695
+ },
+ {
+ "epoch": 3.282131661442006,
+ "grad_norm": 0.9080231785774231,
+ "learning_rate": 2.432994529735286e-06,
+ "loss": 0.409,
+ "step": 696
+ },
+ {
+ "epoch": 3.2868338557993733,
+ "grad_norm": 0.550635814666748,
+ "learning_rate": 2.4262958363814512e-06,
+ "loss": 0.4202,
+ "step": 697
+ },
+ {
+ "epoch": 3.29153605015674,
+ "grad_norm": 0.5728116631507874,
+ "learning_rate": 2.4195976726133574e-06,
+ "loss": 0.406,
+ "step": 698
+ },
+ {
+ "epoch": 3.2962382445141065,
+ "grad_norm": 0.4995472729206085,
+ "learning_rate": 2.4129000865592517e-06,
+ "loss": 0.4063,
+ "step": 699
+ },
+ {
+ "epoch": 3.300940438871473,
+ "grad_norm": 0.601259708404541,
+ "learning_rate": 2.4062031263432267e-06,
+ "loss": 0.4268,
+ "step": 700
+ },
+ {
+ "epoch": 3.30564263322884,
+ "grad_norm": 0.570606529712677,
+ "learning_rate": 2.3995068400848785e-06,
+ "loss": 0.4034,
+ "step": 701
+ },
+ {
+ "epoch": 3.310344827586207,
+ "grad_norm": 0.5638160705566406,
+ "learning_rate": 2.392811275898963e-06,
+ "loss": 0.4212,
+ "step": 702
+ },
+ {
+ "epoch": 3.3150470219435735,
+ "grad_norm": 0.5354572534561157,
+ "learning_rate": 2.3861164818950448e-06,
+ "loss": 0.3893,
+ "step": 703
+ },
+ {
+ "epoch": 3.3197492163009406,
+ "grad_norm": 0.5149163603782654,
+ "learning_rate": 2.379422506177157e-06,
+ "loss": 0.4126,
+ "step": 704
+ },
+ {
+ "epoch": 3.324451410658307,
+ "grad_norm": 0.5132194757461548,
+ "learning_rate": 2.372729396843453e-06,
+ "loss": 0.4132,
+ "step": 705
+ },
+ {
+ "epoch": 3.329153605015674,
+ "grad_norm": 0.5163543224334717,
+ "learning_rate": 2.366037201985858e-06,
+ "loss": 0.418,
+ "step": 706
+ },
+ {
+ "epoch": 3.333855799373041,
+ "grad_norm": 0.5132508277893066,
+ "learning_rate": 2.3593459696897294e-06,
+ "loss": 0.3944,
+ "step": 707
+ },
+ {
+ "epoch": 3.3385579937304075,
+ "grad_norm": 0.5490009188652039,
+ "learning_rate": 2.352655748033508e-06,
+ "loss": 0.414,
+ "step": 708
+ },
+ {
+ "epoch": 3.343260188087774,
+ "grad_norm": 0.5879104733467102,
+ "learning_rate": 2.3459665850883704e-06,
+ "loss": 0.4344,
+ "step": 709
+ },
+ {
+ "epoch": 3.347962382445141,
+ "grad_norm": 0.5451306700706482,
+ "learning_rate": 2.33927852891789e-06,
+ "loss": 0.4208,
+ "step": 710
+ },
+ {
+ "epoch": 3.352664576802508,
+ "grad_norm": 0.5207070708274841,
+ "learning_rate": 2.3325916275776834e-06,
+ "loss": 0.4398,
+ "step": 711
+ },
+ {
+ "epoch": 3.3573667711598745,
+ "grad_norm": 0.5440477132797241,
+ "learning_rate": 2.3259059291150744e-06,
+ "loss": 0.4015,
+ "step": 712
+ },
+ {
+ "epoch": 3.3620689655172415,
+ "grad_norm": 0.5619958639144897,
+ "learning_rate": 2.319221481568739e-06,
+ "loss": 0.4196,
+ "step": 713
+ },
+ {
+ "epoch": 3.366771159874608,
+ "grad_norm": 0.6007470488548279,
+ "learning_rate": 2.3125383329683666e-06,
+ "loss": 0.4217,
+ "step": 714
+ },
+ {
+ "epoch": 3.371473354231975,
+ "grad_norm": 0.4972032904624939,
+ "learning_rate": 2.3058565313343152e-06,
+ "loss": 0.3904,
+ "step": 715
+ },
+ {
+ "epoch": 3.376175548589342,
+ "grad_norm": 0.5420966148376465,
+ "learning_rate": 2.2991761246772623e-06,
+ "loss": 0.4048,
+ "step": 716
+ },
+ {
+ "epoch": 3.3808777429467085,
+ "grad_norm": 0.520063042640686,
+ "learning_rate": 2.2924971609978623e-06,
+ "loss": 0.3965,
+ "step": 717
+ },
+ {
+ "epoch": 3.385579937304075,
+ "grad_norm": 0.8903913497924805,
+ "learning_rate": 2.285819688286403e-06,
+ "loss": 0.3873,
+ "step": 718
+ },
+ {
+ "epoch": 3.3902821316614418,
+ "grad_norm": 0.5380633473396301,
+ "learning_rate": 2.2791437545224563e-06,
+ "loss": 0.4335,
+ "step": 719
+ },
+ {
+ "epoch": 3.394984326018809,
+ "grad_norm": 0.5058356523513794,
+ "learning_rate": 2.2724694076745397e-06,
+ "loss": 0.4134,
+ "step": 720
+ },
+ {
+ "epoch": 3.3996865203761755,
+ "grad_norm": 0.5383400321006775,
+ "learning_rate": 2.265796695699766e-06,
+ "loss": 0.4154,
+ "step": 721
+ },
+ {
+ "epoch": 3.4043887147335425,
+ "grad_norm": 0.5831345319747925,
+ "learning_rate": 2.2591256665434998e-06,
+ "loss": 0.4193,
+ "step": 722
+ },
+ {
+ "epoch": 3.409090909090909,
+ "grad_norm": 0.5494023561477661,
+ "learning_rate": 2.252456368139019e-06,
+ "loss": 0.4137,
+ "step": 723
+ },
+ {
+ "epoch": 3.413793103448276,
+ "grad_norm": 0.5735755562782288,
+ "learning_rate": 2.245788848407159e-06,
+ "loss": 0.4211,
+ "step": 724
+ },
+ {
+ "epoch": 3.4184952978056424,
+ "grad_norm": 0.5244953036308289,
+ "learning_rate": 2.2391231552559815e-06,
+ "loss": 0.4194,
+ "step": 725
+ },
+ {
+ "epoch": 3.4231974921630095,
+ "grad_norm": 0.5803194642066956,
+ "learning_rate": 2.2324593365804184e-06,
+ "loss": 0.3882,
+ "step": 726
+ },
+ {
+ "epoch": 3.427899686520376,
+ "grad_norm": 0.5303656458854675,
+ "learning_rate": 2.225797440261936e-06,
+ "loss": 0.4336,
+ "step": 727
+ },
+ {
+ "epoch": 3.4326018808777428,
+ "grad_norm": 0.6270896792411804,
+ "learning_rate": 2.219137514168187e-06,
+ "loss": 0.397,
+ "step": 728
+ },
+ {
+ "epoch": 3.43730407523511,
+ "grad_norm": 0.5054409503936768,
+ "learning_rate": 2.212479606152667e-06,
+ "loss": 0.4261,
+ "step": 729
+ },
+ {
+ "epoch": 3.4420062695924765,
+ "grad_norm": 0.5422618985176086,
+ "learning_rate": 2.205823764054372e-06,
+ "loss": 0.4105,
+ "step": 730
+ },
+ {
+ "epoch": 3.446708463949843,
+ "grad_norm": 0.5200968980789185,
+ "learning_rate": 2.199170035697453e-06,
+ "loss": 0.4048,
+ "step": 731
+ },
+ {
+ "epoch": 3.45141065830721,
+ "grad_norm": 0.5316998362541199,
+ "learning_rate": 2.1925184688908735e-06,
+ "loss": 0.4132,
+ "step": 732
+ },
+ {
+ "epoch": 3.456112852664577,
+ "grad_norm": 0.5780388116836548,
+ "learning_rate": 2.185869111428067e-06,
+ "loss": 0.4381,
+ "step": 733
+ },
+ {
+ "epoch": 3.4608150470219434,
+ "grad_norm": 0.5547174215316772,
+ "learning_rate": 2.1792220110865885e-06,
+ "loss": 0.4236,
+ "step": 734
+ },
+ {
+ "epoch": 3.4655172413793105,
+ "grad_norm": 0.5188453197479248,
+ "learning_rate": 2.1725772156277795e-06,
+ "loss": 0.4052,
+ "step": 735
+ },
+ {
+ "epoch": 3.470219435736677,
+ "grad_norm": 0.5145602822303772,
+ "learning_rate": 2.165934772796417e-06,
+ "loss": 0.412,
+ "step": 736
+ },
+ {
+ "epoch": 3.4749216300940438,
+ "grad_norm": 0.5960094332695007,
+ "learning_rate": 2.159294730320374e-06,
+ "loss": 0.426,
+ "step": 737
+ },
+ {
+ "epoch": 3.479623824451411,
+ "grad_norm": 0.7090360522270203,
+ "learning_rate": 2.15265713591028e-06,
+ "loss": 0.4133,
+ "step": 738
+ },
+ {
+ "epoch": 3.4843260188087775,
+ "grad_norm": 0.5428952574729919,
+ "learning_rate": 2.1460220372591676e-06,
+ "loss": 0.4332,
+ "step": 739
+ },
+ {
+ "epoch": 3.489028213166144,
+ "grad_norm": 0.6610196232795715,
+ "learning_rate": 2.139389482042142e-06,
+ "loss": 0.3985,
+ "step": 740
+ },
+ {
+ "epoch": 3.493730407523511,
+ "grad_norm": 0.5409770607948303,
+ "learning_rate": 2.1327595179160332e-06,
+ "loss": 0.4148,
+ "step": 741
+ },
+ {
+ "epoch": 3.498432601880878,
+ "grad_norm": 0.8822159171104431,
+ "learning_rate": 2.1261321925190492e-06,
+ "loss": 0.4071,
+ "step": 742
+ },
+ {
+ "epoch": 3.5031347962382444,
+ "grad_norm": 0.5366957783699036,
+ "learning_rate": 2.1195075534704433e-06,
+ "loss": 0.3838,
+ "step": 743
+ },
+ {
+ "epoch": 3.507836990595611,
+ "grad_norm": 0.5289701819419861,
+ "learning_rate": 2.1128856483701625e-06,
+ "loss": 0.4123,
+ "step": 744
+ },
+ {
+ "epoch": 3.512539184952978,
+ "grad_norm": 0.5737835764884949,
+ "learning_rate": 2.10626652479851e-06,
+ "loss": 0.392,
+ "step": 745
+ },
+ {
+ "epoch": 3.5172413793103448,
+ "grad_norm": 0.5381962060928345,
+ "learning_rate": 2.0996502303158057e-06,
+ "loss": 0.4088,
+ "step": 746
+ },
+ {
+ "epoch": 3.521943573667712,
+ "grad_norm": 0.529466450214386,
+ "learning_rate": 2.0930368124620385e-06,
+ "loss": 0.4098,
+ "step": 747
+ },
+ {
+ "epoch": 3.5266457680250785,
+ "grad_norm": 0.6686971783638,
+ "learning_rate": 2.086426318756531e-06,
+ "loss": 0.4273,
+ "step": 748
+ },
+ {
+ "epoch": 3.531347962382445,
+ "grad_norm": 0.5246966481208801,
+ "learning_rate": 2.0798187966975917e-06,
+ "loss": 0.4318,
+ "step": 749
+ },
+ {
+ "epoch": 3.5360501567398117,
+ "grad_norm": 0.5165736675262451,
+ "learning_rate": 2.073214293762179e-06,
+ "loss": 0.4212,
+ "step": 750
+ },
+ {
+ "epoch": 3.540752351097179,
+ "grad_norm": 0.6821503043174744,
+ "learning_rate": 2.0666128574055575e-06,
+ "loss": 0.4199,
+ "step": 751
+ },
+ {
+ "epoch": 3.5454545454545454,
+ "grad_norm": 0.5294732451438904,
+ "learning_rate": 2.0600145350609585e-06,
+ "loss": 0.4192,
+ "step": 752
+ },
+ {
+ "epoch": 3.5501567398119125,
+ "grad_norm": 0.515800416469574,
+ "learning_rate": 2.053419374139235e-06,
+ "loss": 0.4172,
+ "step": 753
+ },
+ {
+ "epoch": 3.554858934169279,
+ "grad_norm": 0.5241639614105225,
+ "learning_rate": 2.0468274220285295e-06,
+ "loss": 0.4138,
+ "step": 754
+ },
+ {
+ "epoch": 3.5595611285266457,
+ "grad_norm": 0.546105146408081,
+ "learning_rate": 2.0402387260939224e-06,
+ "loss": 0.4123,
+ "step": 755
+ },
+ {
+ "epoch": 3.5642633228840124,
+ "grad_norm": 0.5261510014533997,
+ "learning_rate": 2.033653333677103e-06,
+ "loss": 0.4225,
+ "step": 756
+ },
+ {
+ "epoch": 3.5689655172413794,
+ "grad_norm": 0.5825217366218567,
+ "learning_rate": 2.02707129209602e-06,
+ "loss": 0.4042,
+ "step": 757
+ },
+ {
+ "epoch": 3.573667711598746,
+ "grad_norm": 0.5916388034820557,
+ "learning_rate": 2.0204926486445463e-06,
+ "loss": 0.4222,
+ "step": 758
+ },
+ {
+ "epoch": 3.5783699059561127,
+ "grad_norm": 0.5643376708030701,
+ "learning_rate": 2.0139174505921403e-06,
+ "loss": 0.4419,
+ "step": 759
+ },
+ {
+ "epoch": 3.58307210031348,
+ "grad_norm": 0.5426534414291382,
+ "learning_rate": 2.0073457451835e-06,
+ "loss": 0.3985,
+ "step": 760
+ },
+ {
+ "epoch": 3.5877742946708464,
+ "grad_norm": 0.48811203241348267,
+ "learning_rate": 2.0007775796382335e-06,
+ "loss": 0.4249,
+ "step": 761
+ },
+ {
+ "epoch": 3.592476489028213,
+ "grad_norm": 0.5216817855834961,
+ "learning_rate": 1.994213001150508e-06,
+ "loss": 0.3931,
+ "step": 762
+ },
+ {
+ "epoch": 3.5971786833855797,
+ "grad_norm": 0.5739433169364929,
+ "learning_rate": 1.9876520568887207e-06,
+ "loss": 0.42,
+ "step": 763
+ },
+ {
+ "epoch": 3.6018808777429467,
+ "grad_norm": 0.5166419148445129,
+ "learning_rate": 1.981094793995155e-06,
+ "loss": 0.4041,
+ "step": 764
+ },
+ {
+ "epoch": 3.6065830721003134,
+ "grad_norm": 0.6763928532600403,
+ "learning_rate": 1.974541259585641e-06,
+ "loss": 0.4319,
+ "step": 765
+ },
+ {
+ "epoch": 3.6112852664576804,
+ "grad_norm": 0.5443664789199829,
+ "learning_rate": 1.9679915007492194e-06,
+ "loss": 0.4139,
+ "step": 766
+ },
+ {
+ "epoch": 3.615987460815047,
+ "grad_norm": 0.6719280481338501,
+ "learning_rate": 1.9614455645478047e-06,
+ "loss": 0.4015,
+ "step": 767
+ },
+ {
+ "epoch": 3.6206896551724137,
+ "grad_norm": 0.5685383677482605,
+ "learning_rate": 1.9549034980158403e-06,
+ "loss": 0.4153,
+ "step": 768
+ },
+ {
+ "epoch": 3.6253918495297803,
+ "grad_norm": 0.5463993549346924,
+ "learning_rate": 1.9483653481599697e-06,
+ "loss": 0.4193,
+ "step": 769
+ },
+ {
+ "epoch": 3.6300940438871474,
+ "grad_norm": 0.5228095054626465,
+ "learning_rate": 1.9418311619586897e-06,
+ "loss": 0.4268,
+ "step": 770
+ },
+ {
+ "epoch": 3.634796238244514,
+ "grad_norm": 0.6472461223602295,
+ "learning_rate": 1.935300986362018e-06,
+ "loss": 0.3981,
+ "step": 771
+ },
+ {
+ "epoch": 3.639498432601881,
+ "grad_norm": 0.61808842420578,
+ "learning_rate": 1.9287748682911582e-06,
+ "loss": 0.4313,
+ "step": 772
+ },
+ {
+ "epoch": 3.6442006269592477,
+ "grad_norm": 0.5122710466384888,
+ "learning_rate": 1.9222528546381543e-06,
+ "loss": 0.4219,
+ "step": 773
+ },
+ {
+ "epoch": 3.6489028213166144,
+ "grad_norm": 0.5540320873260498,
+ "learning_rate": 1.9157349922655648e-06,
+ "loss": 0.4001,
+ "step": 774
+ },
+ {
+ "epoch": 3.653605015673981,
+ "grad_norm": 0.5066401958465576,
+ "learning_rate": 1.909221328006114e-06,
+ "loss": 0.4089,
+ "step": 775
+ },
+ {
+ "epoch": 3.658307210031348,
+ "grad_norm": 0.5802583694458008,
+ "learning_rate": 1.9027119086623647e-06,
+ "loss": 0.4216,
+ "step": 776
+ },
+ {
+ "epoch": 3.6630094043887147,
+ "grad_norm": 0.5735054016113281,
+ "learning_rate": 1.8962067810063806e-06,
+ "loss": 0.4372,
+ "step": 777
+ },
+ {
+ "epoch": 3.6677115987460818,
+ "grad_norm": 0.5177802443504333,
+ "learning_rate": 1.8897059917793844e-06,
+ "loss": 0.3912,
+ "step": 778
+ },
+ {
+ "epoch": 3.6724137931034484,
+ "grad_norm": 0.5790892243385315,
+ "learning_rate": 1.8832095876914268e-06,
+ "loss": 0.4096,
+ "step": 779
+ },
+ {
+ "epoch": 3.677115987460815,
+ "grad_norm": 0.5386017560958862,
+ "learning_rate": 1.8767176154210537e-06,
+ "loss": 0.4191,
+ "step": 780
+ },
+ {
+ "epoch": 3.6818181818181817,
+ "grad_norm": 0.5927474498748779,
+ "learning_rate": 1.8702301216149616e-06,
+ "loss": 0.4061,
+ "step": 781
+ },
+ {
+ "epoch": 3.6865203761755487,
+ "grad_norm": 0.5609317421913147,
+ "learning_rate": 1.8637471528876727e-06,
+ "loss": 0.4067,
+ "step": 782
+ },
+ {
+ "epoch": 3.6912225705329154,
+ "grad_norm": 0.6609043478965759,
+ "learning_rate": 1.8572687558211923e-06,
+ "loss": 0.4183,
+ "step": 783
+ },
+ {
+ "epoch": 3.695924764890282,
+ "grad_norm": 0.5092527270317078,
+ "learning_rate": 1.850794976964677e-06,
+ "loss": 0.3827,
+ "step": 784
+ },
+ {
+ "epoch": 3.700626959247649,
+ "grad_norm": 0.8918034434318542,
+ "learning_rate": 1.8443258628341026e-06,
+ "loss": 0.4144,
+ "step": 785
+ },
+ {
+ "epoch": 3.7053291536050157,
+ "grad_norm": 0.5443233847618103,
+ "learning_rate": 1.837861459911925e-06,
+ "loss": 0.4246,
+ "step": 786
+ },
+ {
+ "epoch": 3.7100313479623823,
+ "grad_norm": 0.6559080481529236,
+ "learning_rate": 1.8314018146467505e-06,
+ "loss": 0.4067,
+ "step": 787
+ },
+ {
+ "epoch": 3.714733542319749,
+ "grad_norm": 0.5071741342544556,
+ "learning_rate": 1.8249469734529995e-06,
+ "loss": 0.3888,
+ "step": 788
+ },
+ {
+ "epoch": 3.719435736677116,
+ "grad_norm": 0.5663676261901855,
+ "learning_rate": 1.818496982710572e-06,
+ "loss": 0.4256,
+ "step": 789
+ },
+ {
+ "epoch": 3.7241379310344827,
+ "grad_norm": 0.5477777719497681,
+ "learning_rate": 1.81205188876452e-06,
+ "loss": 0.423,
+ "step": 790
+ },
+ {
+ "epoch": 3.7288401253918497,
+ "grad_norm": 0.5709276795387268,
+ "learning_rate": 1.8056117379247078e-06,
+ "loss": 0.4265,
+ "step": 791
+ },
+ {
+ "epoch": 3.7335423197492164,
+ "grad_norm": 0.49602681398391724,
+ "learning_rate": 1.7991765764654813e-06,
+ "loss": 0.4141,
+ "step": 792
+ },
+ {
+ "epoch": 3.738244514106583,
+ "grad_norm": 0.5358700156211853,
+ "learning_rate": 1.7927464506253394e-06,
+ "loss": 0.4231,
+ "step": 793
+ },
+ {
+ "epoch": 3.7429467084639496,
+ "grad_norm": 1.1592613458633423,
+ "learning_rate": 1.7863214066065951e-06,
+ "loss": 0.3929,
+ "step": 794
+ },
+ {
+ "epoch": 3.7476489028213167,
+ "grad_norm": 0.5176786780357361,
+ "learning_rate": 1.779901490575051e-06,
+ "loss": 0.4201,
+ "step": 795
+ },
+ {
+ "epoch": 3.7523510971786833,
+ "grad_norm": 0.5303675532341003,
+ "learning_rate": 1.7734867486596596e-06,
+ "loss": 0.4201,
+ "step": 796
+ },
+ {
+ "epoch": 3.7570532915360504,
+ "grad_norm": 0.5633402466773987,
+ "learning_rate": 1.767077226952198e-06,
+ "loss": 0.4276,
+ "step": 797
+ },
+ {
+ "epoch": 3.761755485893417,
+ "grad_norm": 0.6016635894775391,
+ "learning_rate": 1.7606729715069349e-06,
+ "loss": 0.4143,
+ "step": 798
+ },
+ {
+ "epoch": 3.7664576802507836,
+ "grad_norm": 0.5202106237411499,
+ "learning_rate": 1.7542740283402981e-06,
+ "loss": 0.4195,
+ "step": 799
+ },
+ {
+ "epoch": 3.7711598746081503,
+ "grad_norm": 0.6279420852661133,
+ "learning_rate": 1.7478804434305466e-06,
+ "loss": 0.4001,
+ "step": 800
+ },
+ {
+ "epoch": 3.7758620689655173,
+ "grad_norm": 0.5253601670265198,
+ "learning_rate": 1.741492262717438e-06,
+ "loss": 0.4206,
+ "step": 801
+ },
+ {
+ "epoch": 3.780564263322884,
+ "grad_norm": 0.5218167901039124,
+ "learning_rate": 1.7351095321018974e-06,
+ "loss": 0.387,
+ "step": 802
+ },
+ {
+ "epoch": 3.785266457680251,
+ "grad_norm": 0.530846357345581,
+ "learning_rate": 1.7287322974456933e-06,
+ "loss": 0.3935,
+ "step": 803
+ },
+ {
+ "epoch": 3.7899686520376177,
+ "grad_norm": 0.5487862825393677,
+ "learning_rate": 1.7223606045711006e-06,
+ "loss": 0.4168,
+ "step": 804
+ },
+ {
+ "epoch": 3.7946708463949843,
+ "grad_norm": 0.5345083475112915,
+ "learning_rate": 1.7159944992605774e-06,
+ "loss": 0.4208,
+ "step": 805
+ },
+ {
+ "epoch": 3.799373040752351,
+ "grad_norm": 0.5425072312355042,
+ "learning_rate": 1.7096340272564318e-06,
+ "loss": 0.4088,
+ "step": 806
+ },
+ {
+ "epoch": 3.804075235109718,
+ "grad_norm": 0.5253011584281921,
+ "learning_rate": 1.7032792342604947e-06,
+ "loss": 0.3995,
+ "step": 807
+ },
+ {
+ "epoch": 3.8087774294670846,
+ "grad_norm": 0.7746017575263977,
+ "learning_rate": 1.6969301659337944e-06,
+ "loss": 0.4145,
+ "step": 808
+ },
+ {
+ "epoch": 3.8134796238244513,
+ "grad_norm": 0.7049569487571716,
+ "learning_rate": 1.6905868678962225e-06,
+ "loss": 0.4216,
+ "step": 809
+ },
+ {
+ "epoch": 3.8181818181818183,
+ "grad_norm": 0.602180540561676,
+ "learning_rate": 1.684249385726211e-06,
+ "loss": 0.4134,
+ "step": 810
+ },
+ {
+ "epoch": 3.822884012539185,
+ "grad_norm": 0.5291408896446228,
+ "learning_rate": 1.677917764960404e-06,
+ "loss": 0.402,
+ "step": 811
+ },
+ {
+ "epoch": 3.8275862068965516,
+ "grad_norm": 0.5529280304908752,
+ "learning_rate": 1.6715920510933277e-06,
+ "loss": 0.4322,
+ "step": 812
+ },
+ {
+ "epoch": 3.8322884012539182,
+ "grad_norm": 0.5989758968353271,
+ "learning_rate": 1.6652722895770676e-06,
+ "loss": 0.4275,
+ "step": 813
+ },
+ {
+ "epoch": 3.8369905956112853,
+ "grad_norm": 0.5088624358177185,
+ "learning_rate": 1.6589585258209383e-06,
+ "loss": 0.378,
+ "step": 814
+ },
+ {
+ "epoch": 3.841692789968652,
+ "grad_norm": 0.5167607665061951,
+ "learning_rate": 1.6526508051911588e-06,
+ "loss": 0.4221,
+ "step": 815
+ },
+ {
+ "epoch": 3.846394984326019,
+ "grad_norm": 0.5582865476608276,
+ "learning_rate": 1.6463491730105282e-06,
+ "loss": 0.4091,
+ "step": 816
+ },
+ {
+ "epoch": 3.8510971786833856,
+ "grad_norm": 0.5103083252906799,
+ "learning_rate": 1.6400536745580955e-06,
+ "loss": 0.3867,
+ "step": 817
+ },
+ {
+ "epoch": 3.8557993730407523,
+ "grad_norm": 0.528692901134491,
+ "learning_rate": 1.6337643550688408e-06,
+ "loss": 0.4178,
+ "step": 818
+ },
+ {
+ "epoch": 3.860501567398119,
+ "grad_norm": 0.5174258947372437,
+ "learning_rate": 1.627481259733343e-06,
+ "loss": 0.3989,
+ "step": 819
+ },
+ {
+ "epoch": 3.865203761755486,
+ "grad_norm": 0.492735892534256,
+ "learning_rate": 1.6212044336974598e-06,
+ "loss": 0.3935,
+ "step": 820
+ },
+ {
+ "epoch": 3.8699059561128526,
+ "grad_norm": 0.5810956954956055,
+ "learning_rate": 1.614933922062003e-06,
+ "loss": 0.4082,
+ "step": 821
+ },
+ {
+ "epoch": 3.8746081504702197,
+ "grad_norm": 0.5235511660575867,
+ "learning_rate": 1.6086697698824144e-06,
+ "loss": 0.4026,
+ "step": 822
+ },
+ {
+ "epoch": 3.8793103448275863,
+ "grad_norm": 0.5972744822502136,
+ "learning_rate": 1.6024120221684373e-06,
+ "loss": 0.4018,
+ "step": 823
+ },
+ {
+ "epoch": 3.884012539184953,
+ "grad_norm": 0.5685083270072937,
+ "learning_rate": 1.5961607238838022e-06,
+ "loss": 0.4077,
+ "step": 824
+ },
+ {
+ "epoch": 3.8887147335423196,
+ "grad_norm": 0.5427765250205994,
+ "learning_rate": 1.589915919945894e-06,
+ "loss": 0.4187,
+ "step": 825
+ },
+ {
+ "epoch": 3.8934169278996866,
+ "grad_norm": 0.6297295093536377,
+ "learning_rate": 1.5836776552254386e-06,
+ "loss": 0.4367,
+ "step": 826
+ },
+ {
+ "epoch": 3.8981191222570533,
+ "grad_norm": 0.6110124588012695,
+ "learning_rate": 1.5774459745461711e-06,
+ "loss": 0.4065,
+ "step": 827
+ },
+ {
+ "epoch": 3.9028213166144203,
+ "grad_norm": 0.4981592297554016,
+ "learning_rate": 1.5712209226845201e-06,
+ "loss": 0.3836,
+ "step": 828
+ },
+ {
+ "epoch": 3.907523510971787,
+ "grad_norm": 0.5722451210021973,
+ "learning_rate": 1.565002544369286e-06,
+ "loss": 0.4161,
+ "step": 829
+ },
+ {
+ "epoch": 3.9122257053291536,
+ "grad_norm": 0.6718733310699463,
+ "learning_rate": 1.5587908842813142e-06,
+ "loss": 0.4053,
+ "step": 830
+ },
+ {
+ "epoch": 3.91692789968652,
+ "grad_norm": 0.5070095658302307,
+ "learning_rate": 1.5525859870531823e-06,
+ "loss": 0.4198,
+ "step": 831
+ },
+ {
+ "epoch": 3.9216300940438873,
+ "grad_norm": 0.5303407311439514,
+ "learning_rate": 1.5463878972688707e-06,
+ "loss": 0.4089,
+ "step": 832
+ },
+ {
+ "epoch": 3.926332288401254,
+ "grad_norm": 0.5431908369064331,
+ "learning_rate": 1.5401966594634483e-06,
+ "loss": 0.4341,
+ "step": 833
+ },
+ {
+ "epoch": 3.9310344827586206,
+ "grad_norm": 0.549174427986145,
+ "learning_rate": 1.5340123181227495e-06,
+ "loss": 0.4237,
+ "step": 834
+ },
+ {
+ "epoch": 3.9357366771159876,
+ "grad_norm": 0.8902267217636108,
+ "learning_rate": 1.527834917683058e-06,
+ "loss": 0.3904,
+ "step": 835
+ },
+ {
+ "epoch": 3.9404388714733543,
+ "grad_norm": 0.5055849552154541,
+ "learning_rate": 1.5216645025307813e-06,
+ "loss": 0.4058,
+ "step": 836
+ },
+ {
+ "epoch": 3.945141065830721,
+ "grad_norm": 0.5319788455963135,
+ "learning_rate": 1.5155011170021399e-06,
+ "loss": 0.4153,
+ "step": 837
+ },
+ {
+ "epoch": 3.9498432601880875,
+ "grad_norm": 0.5441375374794006,
+ "learning_rate": 1.5093448053828402e-06,
+ "loss": 0.4231,
+ "step": 838
+ },
+ {
+ "epoch": 3.9545454545454546,
+ "grad_norm": 0.5940942764282227,
+ "learning_rate": 1.503195611907764e-06,
+ "loss": 0.4241,
+ "step": 839
+ },
+ {
+ "epoch": 3.959247648902821,
+ "grad_norm": 0.5203325748443604,
+ "learning_rate": 1.4970535807606453e-06,
+ "loss": 0.3842,
+ "step": 840
+ },
+ {
+ "epoch": 3.9639498432601883,
+ "grad_norm": 0.525404691696167,
+ "learning_rate": 1.4909187560737542e-06,
+ "loss": 0.3954,
+ "step": 841
+ },
+ {
+ "epoch": 3.968652037617555,
+ "grad_norm": 0.5999636054039001,
+ "learning_rate": 1.4847911819275829e-06,
+ "loss": 0.4061,
+ "step": 842
+ },
+ {
+ "epoch": 3.9733542319749215,
+ "grad_norm": 0.5253078937530518,
+ "learning_rate": 1.4786709023505224e-06,
+ "loss": 0.3969,
+ "step": 843
+ },
+ {
+ "epoch": 3.978056426332288,
+ "grad_norm": 0.535467803478241,
+ "learning_rate": 1.4725579613185549e-06,
+ "loss": 0.4241,
+ "step": 844
+ },
+ {
+ "epoch": 3.9827586206896552,
+ "grad_norm": 0.5458933711051941,
+ "learning_rate": 1.4664524027549291e-06,
+ "loss": 0.4102,
+ "step": 845
+ },
+ {
+ "epoch": 3.987460815047022,
+ "grad_norm": 0.515102207660675,
+ "learning_rate": 1.4603542705298493e-06,
+ "loss": 0.3957,
+ "step": 846
+ },
+ {
+ "epoch": 3.992163009404389,
+ "grad_norm": 0.572600245475769,
+ "learning_rate": 1.4542636084601624e-06,
+ "loss": 0.3686,
+ "step": 847
+ },
+ {
+ "epoch": 3.9968652037617556,
+ "grad_norm": 0.520165205001831,
+ "learning_rate": 1.4481804603090358e-06,
+ "loss": 0.4109,
+ "step": 848
+ },
+ {
+ "epoch": 4.004702194357367,
+ "grad_norm": 0.9280151128768921,
+ "learning_rate": 1.4421048697856494e-06,
+ "loss": 0.7531,
+ "step": 849
+ },
+ {
+ "epoch": 4.009404388714733,
+ "grad_norm": 0.5386480093002319,
+ "learning_rate": 1.4360368805448788e-06,
+ "loss": 0.3782,
+ "step": 850
+ },
+ {
+ "epoch": 4.0141065830721,
+ "grad_norm": 0.5074192881584167,
+ "learning_rate": 1.4299765361869837e-06,
+ "loss": 0.3971,
+ "step": 851
+ },
+ {
+ "epoch": 4.018808777429467,
+ "grad_norm": 0.55893874168396,
+ "learning_rate": 1.4239238802572908e-06,
+ "loss": 0.3553,
+ "step": 852
+ },
+ {
+ "epoch": 4.023510971786834,
+ "grad_norm": 0.5474048852920532,
+ "learning_rate": 1.4178789562458847e-06,
+ "loss": 0.3953,
+ "step": 853
+ },
+ {
+ "epoch": 4.028213166144201,
+ "grad_norm": 0.5103669166564941,
+ "learning_rate": 1.4118418075872936e-06,
+ "loss": 0.3801,
+ "step": 854
+ },
+ {
+ "epoch": 4.032915360501567,
+ "grad_norm": 0.48109811544418335,
+ "learning_rate": 1.405812477660178e-06,
+ "loss": 0.3786,
+ "step": 855
+ },
+ {
+ "epoch": 4.037617554858934,
+ "grad_norm": 0.6493998765945435,
+ "learning_rate": 1.3997910097870165e-06,
+ "loss": 0.4014,
+ "step": 856
+ },
+ {
+ "epoch": 4.0423197492163006,
+ "grad_norm": 0.5369696617126465,
+ "learning_rate": 1.3937774472337994e-06,
+ "loss": 0.4058,
+ "step": 857
+ },
+ {
+ "epoch": 4.047021943573668,
+ "grad_norm": 0.5302414894104004,
+ "learning_rate": 1.3877718332097146e-06,
+ "loss": 0.3923,
+ "step": 858
+ },
+ {
+ "epoch": 4.051724137931035,
+ "grad_norm": 0.652701199054718,
+ "learning_rate": 1.3817742108668333e-06,
+ "loss": 0.3972,
+ "step": 859
+ },
+ {
+ "epoch": 4.056426332288401,
+ "grad_norm": 0.5448158979415894,
+ "learning_rate": 1.3757846232998118e-06,
+ "loss": 0.3378,
+ "step": 860
+ },
+ {
+ "epoch": 4.061128526645768,
+ "grad_norm": 0.5433962345123291,
+ "learning_rate": 1.369803113545566e-06,
+ "loss": 0.4121,
+ "step": 861
+ },
+ {
+ "epoch": 4.065830721003135,
+ "grad_norm": 0.5282460451126099,
+ "learning_rate": 1.3638297245829762e-06,
+ "loss": 0.4061,
+ "step": 862
+ },
+ {
+ "epoch": 4.070532915360501,
+ "grad_norm": 0.5211827754974365,
+ "learning_rate": 1.3578644993325701e-06,
+ "loss": 0.4047,
+ "step": 863
+ },
+ {
+ "epoch": 4.075235109717869,
+ "grad_norm": 0.5428538918495178,
+ "learning_rate": 1.3519074806562165e-06,
+ "loss": 0.3947,
+ "step": 864
+ },
+ {
+ "epoch": 4.079937304075235,
+ "grad_norm": 0.5352445244789124,
+ "learning_rate": 1.3459587113568208e-06,
+ "loss": 0.3947,
+ "step": 865
+ },
+ {
+ "epoch": 4.084639498432602,
+ "grad_norm": 0.5329545140266418,
+ "learning_rate": 1.340018234178009e-06,
+ "loss": 0.3987,
+ "step": 866
+ },
+ {
+ "epoch": 4.089341692789969,
+ "grad_norm": 0.5108675956726074,
+ "learning_rate": 1.3340860918038295e-06,
+ "loss": 0.3627,
+ "step": 867
+ },
+ {
+ "epoch": 4.094043887147335,
+ "grad_norm": 0.5213317275047302,
+ "learning_rate": 1.328162326858442e-06,
+ "loss": 0.388,
+ "step": 868
+ },
+ {
+ "epoch": 4.098746081504702,
+ "grad_norm": 0.5299095511436462,
+ "learning_rate": 1.3222469819058112e-06,
+ "loss": 0.3975,
+ "step": 869
+ },
+ {
+ "epoch": 4.103448275862069,
+ "grad_norm": 0.5315486788749695,
+ "learning_rate": 1.3163400994494025e-06,
+ "loss": 0.3989,
+ "step": 870
+ },
+ {
+ "epoch": 4.108150470219436,
+ "grad_norm": 0.614090621471405,
+ "learning_rate": 1.3104417219318762e-06,
+ "loss": 0.3848,
+ "step": 871
+ },
+ {
+ "epoch": 4.112852664576803,
+ "grad_norm": 0.5592188239097595,
+ "learning_rate": 1.3045518917347791e-06,
+ "loss": 0.3928,
+ "step": 872
+ },
+ {
+ "epoch": 4.117554858934169,
+ "grad_norm": 0.551544725894928,
+ "learning_rate": 1.2986706511782476e-06,
+ "loss": 0.3878,
+ "step": 873
+ },
+ {
+ "epoch": 4.122257053291536,
+ "grad_norm": 0.5453651547431946,
+ "learning_rate": 1.2927980425206968e-06,
+ "loss": 0.391,
+ "step": 874
+ },
+ {
+ "epoch": 4.1269592476489025,
+ "grad_norm": 0.5152665972709656,
+ "learning_rate": 1.2869341079585184e-06,
+ "loss": 0.391,
+ "step": 875
+ },
+ {
+ "epoch": 4.131661442006269,
+ "grad_norm": 0.5348275303840637,
+ "learning_rate": 1.2810788896257804e-06,
+ "loss": 0.3603,
+ "step": 876
+ },
+ {
+ "epoch": 4.136363636363637,
+ "grad_norm": 0.7541768550872803,
+ "learning_rate": 1.2752324295939178e-06,
+ "loss": 0.3979,
+ "step": 877
+ },
+ {
+ "epoch": 4.141065830721003,
+ "grad_norm": 0.5464813709259033,
+ "learning_rate": 1.2693947698714409e-06,
+ "loss": 0.4174,
+ "step": 878
+ },
+ {
+ "epoch": 4.14576802507837,
+ "grad_norm": 0.527622401714325,
+ "learning_rate": 1.263565952403622e-06,
+ "loss": 0.3854,
+ "step": 879
+ },
+ {
+ "epoch": 4.150470219435737,
+ "grad_norm": 0.5733200311660767,
+ "learning_rate": 1.2577460190722013e-06,
+ "loss": 0.3989,
+ "step": 880
+ },
+ {
+ "epoch": 4.155172413793103,
+ "grad_norm": 0.587824285030365,
+ "learning_rate": 1.2519350116950842e-06,
+ "loss": 0.4014,
+ "step": 881
+ },
+ {
+ "epoch": 4.15987460815047,
+ "grad_norm": 0.5412627458572388,
+ "learning_rate": 1.2461329720260403e-06,
+ "loss": 0.3861,
+ "step": 882
+ },
+ {
+ "epoch": 4.164576802507837,
+ "grad_norm": 0.5781810283660889,
+ "learning_rate": 1.2403399417544033e-06,
+ "loss": 0.3977,
+ "step": 883
+ },
+ {
+ "epoch": 4.169278996865204,
+ "grad_norm": 0.5613389015197754,
+ "learning_rate": 1.2345559625047718e-06,
+ "loss": 0.404,
+ "step": 884
+ },
+ {
+ "epoch": 4.173981191222571,
+ "grad_norm": 0.5975982546806335,
+ "learning_rate": 1.2287810758367104e-06,
+ "loss": 0.4085,
+ "step": 885
+ },
+ {
+ "epoch": 4.178683385579937,
+ "grad_norm": 0.6386556029319763,
+ "learning_rate": 1.2230153232444511e-06,
+ "loss": 0.3885,
+ "step": 886
+ },
+ {
+ "epoch": 4.183385579937304,
+ "grad_norm": 0.5472486019134521,
+ "learning_rate": 1.217258746156594e-06,
+ "loss": 0.3806,
+ "step": 887
+ },
+ {
+ "epoch": 4.1880877742946705,
+ "grad_norm": 0.7230023145675659,
+ "learning_rate": 1.2115113859358118e-06,
+ "loss": 0.3846,
+ "step": 888
+ },
+ {
+ "epoch": 4.192789968652038,
+ "grad_norm": 0.5451399683952332,
+ "learning_rate": 1.2057732838785514e-06,
+ "loss": 0.3681,
+ "step": 889
+ },
+ {
+ "epoch": 4.197492163009405,
+ "grad_norm": 0.6396780610084534,
+ "learning_rate": 1.2000444812147333e-06,
+ "loss": 0.3812,
+ "step": 890
+ },
+ {
+ "epoch": 4.202194357366771,
+ "grad_norm": 0.6256916522979736,
+ "learning_rate": 1.1943250191074664e-06,
+ "loss": 0.4002,
+ "step": 891
+ },
+ {
+ "epoch": 4.206896551724138,
+ "grad_norm": 0.5897160768508911,
+ "learning_rate": 1.188614938652738e-06,
+ "loss": 0.4073,
+ "step": 892
+ },
+ {
+ "epoch": 4.2115987460815045,
+ "grad_norm": 0.5560889840126038,
+ "learning_rate": 1.1829142808791294e-06,
+ "loss": 0.3689,
+ "step": 893
+ },
+ {
+ "epoch": 4.216300940438871,
+ "grad_norm": 0.5476351380348206,
+ "learning_rate": 1.177223086747516e-06,
+ "loss": 0.3795,
+ "step": 894
+ },
+ {
+ "epoch": 4.221003134796238,
+ "grad_norm": 0.5640100240707397,
+ "learning_rate": 1.1715413971507747e-06,
+ "loss": 0.3935,
+ "step": 895
+ },
+ {
+ "epoch": 4.225705329153605,
+ "grad_norm": 0.5437642335891724,
+ "learning_rate": 1.1658692529134888e-06,
+ "loss": 0.3791,
+ "step": 896
+ },
+ {
+ "epoch": 4.230407523510972,
+ "grad_norm": 0.7564667463302612,
+ "learning_rate": 1.1602066947916565e-06,
+ "loss": 0.4002,
+ "step": 897
+ },
+ {
+ "epoch": 4.235109717868339,
+ "grad_norm": 0.5328983664512634,
+ "learning_rate": 1.154553763472396e-06,
+ "loss": 0.3495,
+ "step": 898
+ },
+ {
+ "epoch": 4.239811912225705,
+ "grad_norm": 0.5688467025756836,
+ "learning_rate": 1.1489104995736543e-06,
+ "loss": 0.3807,
+ "step": 899
+ },
+ {
+ "epoch": 4.244514106583072,
+ "grad_norm": 0.5422545075416565,
+ "learning_rate": 1.1432769436439162e-06,
+ "loss": 0.3955,
+ "step": 900
+ },
+ {
+ "epoch": 4.2492163009404385,
+ "grad_norm": 0.5231274366378784,
+ "learning_rate": 1.1376531361619105e-06,
+ "loss": 0.4035,
+ "step": 901
+ },
+ {
+ "epoch": 4.253918495297806,
+ "grad_norm": 0.578623354434967,
+ "learning_rate": 1.1320391175363225e-06,
+ "loss": 0.3796,
+ "step": 902
+ },
+ {
+ "epoch": 4.258620689655173,
+ "grad_norm": 0.5331007838249207,
+ "learning_rate": 1.126434928105497e-06,
+ "loss": 0.3841,
+ "step": 903
+ },
+ {
+ "epoch": 4.263322884012539,
+ "grad_norm": 0.5077575445175171,
+ "learning_rate": 1.1208406081371612e-06,
+ "loss": 0.386,
+ "step": 904
+ },
+ {
+ "epoch": 4.268025078369906,
+ "grad_norm": 0.5260904431343079,
+ "learning_rate": 1.11525619782812e-06,
+ "loss": 0.4004,
+ "step": 905
+ },
+ {
+ "epoch": 4.2727272727272725,
+ "grad_norm": 0.5973961353302002,
+ "learning_rate": 1.1096817373039773e-06,
+ "loss": 0.4038,
+ "step": 906
+ },
+ {
+ "epoch": 4.277429467084639,
+ "grad_norm": 0.5325058102607727,
+ "learning_rate": 1.104117266618846e-06,
+ "loss": 0.3961,
+ "step": 907
+ },
+ {
+ "epoch": 4.282131661442007,
+ "grad_norm": 0.5536799430847168,
+ "learning_rate": 1.0985628257550575e-06,
+ "loss": 0.3844,
+ "step": 908
+ },
+ {
+ "epoch": 4.286833855799373,
+ "grad_norm": 0.6204715371131897,
+ "learning_rate": 1.0930184546228769e-06,
+ "loss": 0.3916,
+ "step": 909
+ },
+ {
+ "epoch": 4.29153605015674,
+ "grad_norm": 0.5359520316123962,
+ "learning_rate": 1.087484193060215e-06,
+ "loss": 0.3612,
+ "step": 910
+ },
+ {
+ "epoch": 4.2962382445141065,
+ "grad_norm": 0.7552776336669922,
+ "learning_rate": 1.0819600808323424e-06,
+ "loss": 0.3986,
+ "step": 911
+ },
+ {
+ "epoch": 4.300940438871473,
+ "grad_norm": 0.545625627040863,
+ "learning_rate": 1.0764461576316041e-06,
+ "loss": 0.3829,
+ "step": 912
+ },
+ {
+ "epoch": 4.30564263322884,
+ "grad_norm": 0.5795807838439941,
+ "learning_rate": 1.0709424630771333e-06,
+ "loss": 0.3985,
+ "step": 913
+ },
+ {
+ "epoch": 4.310344827586207,
+ "grad_norm": 0.621943473815918,
+ "learning_rate": 1.0654490367145684e-06,
+ "loss": 0.3882,
+ "step": 914
+ },
+ {
+ "epoch": 4.315047021943574,
+ "grad_norm": 0.5678103566169739,
+ "learning_rate": 1.0599659180157678e-06,
+ "loss": 0.4061,
+ "step": 915
+ },
+ {
+ "epoch": 4.3197492163009406,
+ "grad_norm": 0.5638558268547058,
+ "learning_rate": 1.0544931463785237e-06,
+ "loss": 0.4247,
+ "step": 916
+ },
+ {
+ "epoch": 4.324451410658307,
+ "grad_norm": 0.5709723234176636,
+ "learning_rate": 1.049030761126287e-06,
+ "loss": 0.4002,
+ "step": 917
+ },
+ {
+ "epoch": 4.329153605015674,
+ "grad_norm": 0.5887544751167297,
+ "learning_rate": 1.043578801507874e-06,
+ "loss": 0.381,
+ "step": 918
+ },
+ {
+ "epoch": 4.33385579937304,
+ "grad_norm": 0.5499666929244995,
+ "learning_rate": 1.038137306697193e-06,
+ "loss": 0.4029,
+ "step": 919
+ },
+ {
+ "epoch": 4.338557993730408,
+ "grad_norm": 0.676122784614563,
+ "learning_rate": 1.0327063157929582e-06,
+ "loss": 0.3925,
+ "step": 920
+ },
+ {
+ "epoch": 4.343260188087775,
+ "grad_norm": 0.5894976258277893,
+ "learning_rate": 1.027285867818411e-06,
+ "loss": 0.3945,
+ "step": 921
+ },
+ {
+ "epoch": 4.347962382445141,
+ "grad_norm": 0.9533663392066956,
+ "learning_rate": 1.021876001721039e-06,
+ "loss": 0.3402,
+ "step": 922
+ },
+ {
+ "epoch": 4.352664576802508,
+ "grad_norm": 0.5602714419364929,
+ "learning_rate": 1.016476756372295e-06,
+ "loss": 0.3901,
+ "step": 923
+ },
+ {
+ "epoch": 4.3573667711598745,
+ "grad_norm": 0.5252093076705933,
+ "learning_rate": 1.011088170567319e-06,
+ "loss": 0.3807,
+ "step": 924
+ },
+ {
+ "epoch": 4.362068965517241,
+ "grad_norm": 0.5782448053359985,
+ "learning_rate": 1.0057102830246596e-06,
+ "loss": 0.373,
+ "step": 925
+ },
+ {
+ "epoch": 4.366771159874608,
+ "grad_norm": 0.5740293264389038,
+ "learning_rate": 1.0003431323859943e-06,
+ "loss": 0.4013,
+ "step": 926
+ },
+ {
+ "epoch": 4.371473354231975,
+ "grad_norm": 0.5553807616233826,
+ "learning_rate": 9.949867572158544e-07,
+ "loss": 0.3909,
+ "step": 927
+ },
+ {
+ "epoch": 4.376175548589342,
+ "grad_norm": 0.5707646012306213,
+ "learning_rate": 9.896411960013455e-07,
+ "loss": 0.4001,
+ "step": 928
+ },
+ {
+ "epoch": 4.3808777429467085,
+ "grad_norm": 0.6075118184089661,
+ "learning_rate": 9.843064871518694e-07,
+ "loss": 0.3815,
+ "step": 929
+ },
+ {
+ "epoch": 4.385579937304075,
+ "grad_norm": 0.535280168056488,
+ "learning_rate": 9.78982668998856e-07,
+ "loss": 0.3741,
+ "step": 930
+ },
+ {
+ "epoch": 4.390282131661442,
+ "grad_norm": 0.5094203352928162,
+ "learning_rate": 9.736697797954766e-07,
+ "loss": 0.4004,
+ "step": 931
+ },
+ {
+ "epoch": 4.394984326018808,
+ "grad_norm": 0.5600079298019409,
+ "learning_rate": 9.683678577163788e-07,
+ "loss": 0.3935,
+ "step": 932
+ },
+ {
+ "epoch": 4.399686520376176,
+ "grad_norm": 0.5435491800308228,
+ "learning_rate": 9.630769408574065e-07,
+ "loss": 0.3676,
+ "step": 933
+ },
+ {
+ "epoch": 4.4043887147335425,
+ "grad_norm": 0.5918356776237488,
+ "learning_rate": 9.577970672353274e-07,
+ "loss": 0.373,
+ "step": 934
+ },
+ {
+ "epoch": 4.409090909090909,
+ "grad_norm": 0.547618567943573,
+ "learning_rate": 9.525282747875636e-07,
+ "loss": 0.3674,
+ "step": 935
+ },
+ {
+ "epoch": 4.413793103448276,
+ "grad_norm": 0.6398045420646667,
+ "learning_rate": 9.472706013719113e-07,
+ "loss": 0.3947,
+ "step": 936
+ },
+ {
+ "epoch": 4.418495297805642,
+ "grad_norm": 0.5805232524871826,
+ "learning_rate": 9.420240847662759e-07,
+ "loss": 0.3803,
+ "step": 937
+ },
+ {
+ "epoch": 4.423197492163009,
+ "grad_norm": 0.5517405867576599,
+ "learning_rate": 9.367887626683975e-07,
+ "loss": 0.4065,
+ "step": 938
+ },
+ {
+ "epoch": 4.427899686520377,
+ "grad_norm": 0.563588559627533,
+ "learning_rate": 9.315646726955798e-07,
+ "loss": 0.3844,
+ "step": 939
+ },
+ {
+ "epoch": 4.432601880877743,
+ "grad_norm": 0.7672348022460938,
+ "learning_rate": 9.263518523844211e-07,
+ "loss": 0.3827,
+ "step": 940
+ },
+ {
+ "epoch": 4.43730407523511,
+ "grad_norm": 0.54765784740448,
+ "learning_rate": 9.211503391905446e-07,
+ "loss": 0.3856,
+ "step": 941
+ },
+ {
+ "epoch": 4.4420062695924765,
+ "grad_norm": 0.5360795259475708,
+ "learning_rate": 9.159601704883253e-07,
+ "loss": 0.3902,
+ "step": 942
+ },
+ {
+ "epoch": 4.446708463949843,
+ "grad_norm": 0.5291644334793091,
+ "learning_rate": 9.107813835706303e-07,
+ "loss": 0.3617,
+ "step": 943
+ },
+ {
+ "epoch": 4.45141065830721,
+ "grad_norm": 0.5579796433448792,
+ "learning_rate": 9.056140156485385e-07,
+ "loss": 0.3777,
+ "step": 944
+ },
+ {
+ "epoch": 4.456112852664576,
+ "grad_norm": 0.7645874619483948,
+ "learning_rate": 9.004581038510865e-07,
+ "loss": 0.3877,
+ "step": 945
+ },
+ {
+ "epoch": 4.460815047021944,
+ "grad_norm": 0.5321459174156189,
+ "learning_rate": 8.953136852249922e-07,
+ "loss": 0.4057,
+ "step": 946
+ },
+ {
+ "epoch": 4.4655172413793105,
+ "grad_norm": 0.5971282720565796,
+ "learning_rate": 8.901807967343898e-07,
+ "loss": 0.3998,
+ "step": 947
+ },
+ {
+ "epoch": 4.470219435736677,
+ "grad_norm": 0.5772238373756409,
+ "learning_rate": 8.850594752605712e-07,
+ "loss": 0.3967,
+ "step": 948
+ },
+ {
+ "epoch": 4.474921630094044,
+ "grad_norm": 0.5422664284706116,
+ "learning_rate": 8.79949757601711e-07,
+ "loss": 0.3882,
+ "step": 949
+ },
+ {
+ "epoch": 4.47962382445141,
+ "grad_norm": 0.5209662914276123,
+ "learning_rate": 8.748516804726096e-07,
+ "loss": 0.3872,
+ "step": 950
+ },
+ {
+ "epoch": 4.484326018808777,
+ "grad_norm": 0.6436011791229248,
+ "learning_rate": 8.697652805044265e-07,
+ "loss": 0.3669,
+ "step": 951
+ },
+ {
+ "epoch": 4.4890282131661445,
+ "grad_norm": 0.5284281969070435,
+ "learning_rate": 8.646905942444172e-07,
+ "loss": 0.3731,
+ "step": 952
+ },
+ {
+ "epoch": 4.493730407523511,
+ "grad_norm": 0.857571542263031,
+ "learning_rate": 8.59627658155671e-07,
+ "loss": 0.3933,
+ "step": 953
+ },
+ {
+ "epoch": 4.498432601880878,
+ "grad_norm": 0.5689031481742859,
+ "learning_rate": 8.545765086168484e-07,
+ "loss": 0.3836,
+ "step": 954
+ },
+ {
+ "epoch": 4.503134796238244,
+ "grad_norm": 0.5461127758026123,
+ "learning_rate": 8.495371819219206e-07,
+ "loss": 0.3984,
+ "step": 955
+ },
+ {
+ "epoch": 4.507836990595611,
+ "grad_norm": 0.591744065284729,
+ "learning_rate": 8.44509714279908e-07,
+ "loss": 0.4096,
+ "step": 956
+ },
+ {
+ "epoch": 4.512539184952978,
+ "grad_norm": 0.5600095391273499,
+ "learning_rate": 8.394941418146202e-07,
+ "loss": 0.4012,
+ "step": 957
+ },
+ {
+ "epoch": 4.517241379310345,
+ "grad_norm": 0.5238003730773926,
+ "learning_rate": 8.344905005643967e-07,
+ "loss": 0.4019,
+ "step": 958
+ },
+ {
+ "epoch": 4.521943573667712,
+ "grad_norm": 0.5452944040298462,
+ "learning_rate": 8.294988264818488e-07,
+ "loss": 0.391,
+ "step": 959
+ },
+ {
+ "epoch": 4.5266457680250785,
+ "grad_norm": 0.570563554763794,
+ "learning_rate": 8.245191554335963e-07,
+ "loss": 0.3836,
+ "step": 960
+ },
+ {
+ "epoch": 4.531347962382445,
+ "grad_norm": 0.526006281375885,
+ "learning_rate": 8.1955152320002e-07,
+ "loss": 0.3894,
+ "step": 961
+ },
+ {
+ "epoch": 4.536050156739812,
+ "grad_norm": 0.6105053424835205,
+ "learning_rate": 8.145959654749924e-07,
+ "loss": 0.4004,
+ "step": 962
+ },
+ {
+ "epoch": 4.540752351097178,
+ "grad_norm": 0.6597625017166138,
+ "learning_rate": 8.096525178656306e-07,
+ "loss": 0.3694,
+ "step": 963
+ },
+ {
+ "epoch": 4.545454545454545,
+ "grad_norm": 0.546521782875061,
+ "learning_rate": 8.047212158920362e-07,
+ "loss": 0.397,
+ "step": 964
+ },
+ {
+ "epoch": 4.5501567398119125,
+ "grad_norm": 0.518375813961029,
+ "learning_rate": 7.998020949870402e-07,
+ "loss": 0.4126,
+ "step": 965
+ },
+ {
+ "epoch": 4.554858934169279,
+ "grad_norm": 0.6008384823799133,
+ "learning_rate": 7.948951904959504e-07,
+ "loss": 0.3799,
+ "step": 966
+ },
+ {
+ "epoch": 4.559561128526646,
+ "grad_norm": 0.5546853542327881,
+ "learning_rate": 7.900005376762948e-07,
+ "loss": 0.3894,
+ "step": 967
+ },
+ {
+ "epoch": 4.564263322884012,
+ "grad_norm": 0.5475030541419983,
+ "learning_rate": 7.851181716975703e-07,
+ "loss": 0.3977,
+ "step": 968
+ },
+ {
+ "epoch": 4.568965517241379,
+ "grad_norm": 0.5156254172325134,
+ "learning_rate": 7.802481276409896e-07,
+ "loss": 0.3635,
+ "step": 969
+ },
+ {
+ "epoch": 4.5736677115987465,
+ "grad_norm": 0.5934706330299377,
+ "learning_rate": 7.75390440499228e-07,
+ "loss": 0.3735,
+ "step": 970
+ },
+ {
+ "epoch": 4.578369905956113,
+ "grad_norm": 0.5446907877922058,
+ "learning_rate": 7.705451451761734e-07,
+ "loss": 0.3722,
+ "step": 971
+ },
+ {
+ "epoch": 4.58307210031348,
+ "grad_norm": 0.5843047499656677,
+ "learning_rate": 7.657122764866754e-07,
+ "loss": 0.37,
+ "step": 972
+ },
+ {
+ "epoch": 4.587774294670846,
+ "grad_norm": 0.5700147747993469,
+ "learning_rate": 7.608918691562914e-07,
+ "loss": 0.4071,
+ "step": 973
+ },
+ {
+ "epoch": 4.592476489028213,
+ "grad_norm": 0.5433696508407593,
+ "learning_rate": 7.560839578210466e-07,
+ "loss": 0.371,
+ "step": 974
+ },
+ {
+ "epoch": 4.59717868338558,
+ "grad_norm": 1.282175064086914,
+ "learning_rate": 7.512885770271722e-07,
+ "loss": 0.3936,
+ "step": 975
+ },
+ {
+ "epoch": 4.601880877742946,
+ "grad_norm": 0.6217600107192993,
+ "learning_rate": 7.465057612308676e-07,
+ "loss": 0.3902,
+ "step": 976
+ },
+ {
+ "epoch": 4.606583072100314,
+ "grad_norm": 0.536109983921051,
+ "learning_rate": 7.417355447980484e-07,
+ "loss": 0.3955,
+ "step": 977
+ },
+ {
+ "epoch": 4.61128526645768,
+ "grad_norm": 0.5526042580604553,
+ "learning_rate": 7.369779620041001e-07,
+ "loss": 0.382,
+ "step": 978
+ },
+ {
+ "epoch": 4.615987460815047,
+ "grad_norm": 0.5479426980018616,
+ "learning_rate": 7.322330470336314e-07,
+ "loss": 0.4093,
+ "step": 979
+ },
+ {
+ "epoch": 4.620689655172414,
+ "grad_norm": 0.7818365693092346,
+ "learning_rate": 7.275008339802295e-07,
+ "loss": 0.3924,
+ "step": 980
+ },
+ {
+ "epoch": 4.62539184952978,
+ "grad_norm": 0.5750322937965393,
+ "learning_rate": 7.227813568462141e-07,
+ "loss": 0.3742,
+ "step": 981
+ },
+ {
+ "epoch": 4.630094043887147,
+ "grad_norm": 0.5750429034233093,
+ "learning_rate": 7.180746495423946e-07,
+ "loss": 0.3914,
+ "step": 982
+ },
+ {
+ "epoch": 4.6347962382445145,
+ "grad_norm": 0.5530590415000916,
+ "learning_rate": 7.133807458878247e-07,
+ "loss": 0.3896,
+ "step": 983
+ },
+ {
+ "epoch": 4.639498432601881,
+ "grad_norm": 0.5401444435119629,
+ "learning_rate": 7.086996796095599e-07,
+ "loss": 0.3832,
+ "step": 984
+ },
+ {
+ "epoch": 4.644200626959248,
+ "grad_norm": 0.5471640229225159,
+ "learning_rate": 7.040314843424173e-07,
+ "loss": 0.3922,
+ "step": 985
+ },
+ {
+ "epoch": 4.648902821316614,
+ "grad_norm": 0.5962896943092346,
+ "learning_rate": 6.99376193628728e-07,
+ "loss": 0.3839,
+ "step": 986
+ },
+ {
+ "epoch": 4.653605015673981,
+ "grad_norm": 0.5511114597320557,
+ "learning_rate": 6.947338409181056e-07,
+ "loss": 0.3867,
+ "step": 987
+ },
+ {
+ "epoch": 4.658307210031348,
+ "grad_norm": 0.5311186909675598,
+ "learning_rate": 6.90104459567196e-07,
+ "loss": 0.3886,
+ "step": 988
+ },
+ {
+ "epoch": 4.663009404388715,
+ "grad_norm": 0.7723526358604431,
+ "learning_rate": 6.854880828394442e-07,
+ "loss": 0.379,
+ "step": 989
+ },
+ {
+ "epoch": 4.667711598746082,
+ "grad_norm": 0.5676357746124268,
+ "learning_rate": 6.808847439048524e-07,
+ "loss": 0.4067,
+ "step": 990
+ },
+ {
+ "epoch": 4.672413793103448,
+ "grad_norm": 0.9501140713691711,
+ "learning_rate": 6.762944758397432e-07,
+ "loss": 0.3914,
+ "step": 991
+ },
+ {
+ "epoch": 4.677115987460815,
+ "grad_norm": 0.5385439395904541,
+ "learning_rate": 6.717173116265208e-07,
+ "loss": 0.3842,
+ "step": 992
+ },
+ {
+ "epoch": 4.681818181818182,
+ "grad_norm": 0.5315724611282349,
+ "learning_rate": 6.671532841534345e-07,
+ "loss": 0.3952,
+ "step": 993
+ },
+ {
+ "epoch": 4.686520376175548,
+ "grad_norm": 0.580390214920044,
+ "learning_rate": 6.626024262143421e-07,
+ "loss": 0.4011,
+ "step": 994
+ },
+ {
+ "epoch": 4.691222570532915,
+ "grad_norm": 0.5717929005622864,
+ "learning_rate": 6.58064770508475e-07,
+ "loss": 0.3848,
+ "step": 995
+ },
+ {
+ "epoch": 4.695924764890282,
+ "grad_norm": 0.7644345164299011,
+ "learning_rate": 6.535403496402023e-07,
+ "loss": 0.3718,
+ "step": 996
+ },
+ {
+ "epoch": 4.700626959247649,
+ "grad_norm": 0.8252847790718079,
+ "learning_rate": 6.490291961187975e-07,
+ "loss": 0.3756,
+ "step": 997
+ },
+ {
+ "epoch": 4.705329153605016,
+ "grad_norm": 0.6276743412017822,
+ "learning_rate": 6.445313423582039e-07,
+ "loss": 0.4097,
+ "step": 998
+ },
+ {
+ "epoch": 4.710031347962382,
+ "grad_norm": 0.5425130724906921,
+ "learning_rate": 6.400468206768004e-07,
+ "loss": 0.3926,
+ "step": 999
+ },
+ {
+ "epoch": 4.714733542319749,
+ "grad_norm": 0.5565195083618164,
+ "learning_rate": 6.35575663297176e-07,
+ "loss": 0.3973,
+ "step": 1000
+ },
+ {
+ "epoch": 4.7194357366771165,
+ "grad_norm": 0.5730810165405273,
+ "learning_rate": 6.31117902345888e-07,
+ "loss": 0.3448,
+ "step": 1001
+ },
+ {
+ "epoch": 4.724137931034483,
+ "grad_norm": 0.6187518835067749,
+ "learning_rate": 6.266735698532392e-07,
+ "loss": 0.387,
+ "step": 1002
+ },
+ {
+ "epoch": 4.72884012539185,
+ "grad_norm": 0.5731320381164551,
+ "learning_rate": 6.222426977530449e-07,
+ "loss": 0.4064,
+ "step": 1003
+ },
+ {
+ "epoch": 4.733542319749216,
+ "grad_norm": 0.5795004367828369,
+ "learning_rate": 6.178253178824029e-07,
+ "loss": 0.3985,
+ "step": 1004
+ },
+ {
+ "epoch": 4.738244514106583,
+ "grad_norm": 0.5685634016990662,
+ "learning_rate": 6.134214619814657e-07,
+ "loss": 0.3817,
+ "step": 1005
+ },
+ {
+ "epoch": 4.74294670846395,
+ "grad_norm": 0.5926253199577332,
+ "learning_rate": 6.090311616932127e-07,
+ "loss": 0.3735,
+ "step": 1006
+ },
+ {
+ "epoch": 4.747648902821316,
+ "grad_norm": 0.5256511569023132,
+ "learning_rate": 6.04654448563221e-07,
+ "loss": 0.3805,
+ "step": 1007
+ },
+ {
+ "epoch": 4.752351097178684,
+ "grad_norm": 0.5808703303337097,
+ "learning_rate": 6.002913540394417e-07,
+ "loss": 0.3615,
+ "step": 1008
+ },
+ {
+ "epoch": 4.75705329153605,
+ "grad_norm": 0.5645278692245483,
+ "learning_rate": 5.959419094719713e-07,
+ "loss": 0.405,
+ "step": 1009
+ },
+ {
+ "epoch": 4.761755485893417,
+ "grad_norm": 0.535028874874115,
+ "learning_rate": 5.916061461128269e-07,
+ "loss": 0.3823,
+ "step": 1010
+ },
+ {
+ "epoch": 4.766457680250784,
+ "grad_norm": 0.5427082180976868,
+ "learning_rate": 5.872840951157241e-07,
+ "loss": 0.3643,
+ "step": 1011
+ },
+ {
+ "epoch": 4.77115987460815,
+ "grad_norm": 0.5948965549468994,
+ "learning_rate": 5.829757875358477e-07,
+ "loss": 0.3834,
+ "step": 1012
+ },
+ {
+ "epoch": 4.775862068965517,
+ "grad_norm": 1.4611191749572754,
+ "learning_rate": 5.786812543296372e-07,
+ "loss": 0.3923,
+ "step": 1013
+ },
+ {
+ "epoch": 4.7805642633228835,
+ "grad_norm": 0.5925397276878357,
+ "learning_rate": 5.744005263545538e-07,
+ "loss": 0.4105,
+ "step": 1014
+ },
+ {
+ "epoch": 4.785266457680251,
+ "grad_norm": 0.5865631103515625,
+ "learning_rate": 5.701336343688671e-07,
+ "loss": 0.4086,
+ "step": 1015
+ },
+ {
+ "epoch": 4.789968652037618,
+ "grad_norm": 0.5993569493293762,
+ "learning_rate": 5.658806090314322e-07,
+ "loss": 0.3738,
+ "step": 1016
+ },
+ {
+ "epoch": 4.794670846394984,
+ "grad_norm": 0.5465255975723267,
+ "learning_rate": 5.616414809014647e-07,
+ "loss": 0.3801,
+ "step": 1017
+ },
+ {
+ "epoch": 4.799373040752351,
+ "grad_norm": 0.5121073722839355,
+ "learning_rate": 5.574162804383293e-07,
+ "loss": 0.3896,
+ "step": 1018
+ },
+ {
+ "epoch": 4.804075235109718,
+ "grad_norm": 0.5888665318489075,
+ "learning_rate": 5.532050380013115e-07,
+ "loss": 0.3833,
+ "step": 1019
+ },
+ {
+ "epoch": 4.808777429467085,
+ "grad_norm": 0.5188261866569519,
+ "learning_rate": 5.490077838494079e-07,
+ "loss": 0.4127,
+ "step": 1020
+ },
+ {
+ "epoch": 4.813479623824452,
+ "grad_norm": 0.5498382449150085,
+ "learning_rate": 5.448245481411041e-07,
+ "loss": 0.3933,
+ "step": 1021
+ },
+ {
+ "epoch": 4.818181818181818,
+ "grad_norm": 0.5509280562400818,
+ "learning_rate": 5.406553609341586e-07,
+ "loss": 0.3912,
+ "step": 1022
+ },
+ {
+ "epoch": 4.822884012539185,
+ "grad_norm": 0.5588513612747192,
+ "learning_rate": 5.365002521853882e-07,
+ "loss": 0.3757,
+ "step": 1023
+ },
+ {
+ "epoch": 4.827586206896552,
+ "grad_norm": 0.5885221362113953,
+ "learning_rate": 5.32359251750452e-07,
+ "loss": 0.3836,
+ "step": 1024
+ },
+ {
+ "epoch": 4.832288401253918,
+ "grad_norm": 0.7824872136116028,
+ "learning_rate": 5.282323893836347e-07,
+ "loss": 0.4078,
+ "step": 1025
+ },
+ {
+ "epoch": 4.836990595611285,
+ "grad_norm": 0.5329296588897705,
+ "learning_rate": 5.241196947376382e-07,
+ "loss": 0.3844,
+ "step": 1026
+ },
+ {
+ "epoch": 4.841692789968652,
+ "grad_norm": 0.5577712059020996,
+ "learning_rate": 5.200211973633632e-07,
+ "loss": 0.4107,
+ "step": 1027
+ },
+ {
+ "epoch": 4.846394984326019,
+ "grad_norm": 0.854481041431427,
+ "learning_rate": 5.15936926709699e-07,
+ "loss": 0.3988,
+ "step": 1028
+ },
+ {
+ "epoch": 4.851097178683386,
+ "grad_norm": 0.5857868790626526,
+ "learning_rate": 5.118669121233127e-07,
+ "loss": 0.3935,
+ "step": 1029
+ },
+ {
+ "epoch": 4.855799373040752,
+ "grad_norm": 0.5981507897377014,
+ "learning_rate": 5.078111828484347e-07,
+ "loss": 0.3914,
+ "step": 1030
+ },
+ {
+ "epoch": 4.860501567398119,
+ "grad_norm": 0.5649446845054626,
+ "learning_rate": 5.037697680266565e-07,
+ "loss": 0.3961,
+ "step": 1031
+ },
+ {
+ "epoch": 4.8652037617554855,
+ "grad_norm": 0.5941659808158875,
+ "learning_rate": 4.997426966967106e-07,
+ "loss": 0.3942,
+ "step": 1032
+ },
+ {
+ "epoch": 4.869905956112853,
+ "grad_norm": 0.581913411617279,
+ "learning_rate": 4.957299977942704e-07,
+ "loss": 0.3806,
+ "step": 1033
+ },
+ {
+ "epoch": 4.87460815047022,
+ "grad_norm": 0.5254392027854919,
+ "learning_rate": 4.917317001517389e-07,
+ "loss": 0.3859,
+ "step": 1034
+ },
+ {
+ "epoch": 4.879310344827586,
+ "grad_norm": 0.5529137849807739,
+ "learning_rate": 4.877478324980412e-07,
+ "loss": 0.4055,
+ "step": 1035
+ },
+ {
+ "epoch": 4.884012539184953,
+ "grad_norm": 0.5569112300872803,
+ "learning_rate": 4.837784234584194e-07,
+ "loss": 0.3771,
+ "step": 1036
+ },
+ {
+ "epoch": 4.88871473354232,
+ "grad_norm": 0.6729010343551636,
+ "learning_rate": 4.79823501554226e-07,
+ "loss": 0.3983,
+ "step": 1037
+ },
+ {
+ "epoch": 4.893416927899686,
+ "grad_norm": 0.5438387989997864,
+ "learning_rate": 4.7588309520271934e-07,
+ "loss": 0.3805,
+ "step": 1038
+ },
+ {
+ "epoch": 4.898119122257054,
+ "grad_norm": 0.5601168870925903,
+ "learning_rate": 4.7195723271685893e-07,
+ "loss": 0.413,
+ "step": 1039
+ },
+ {
+ "epoch": 4.90282131661442,
+ "grad_norm": 0.5603858232498169,
+ "learning_rate": 4.6804594230510286e-07,
+ "loss": 0.4093,
+ "step": 1040
+ },
+ {
+ "epoch": 4.907523510971787,
+ "grad_norm": 0.5581585764884949,
+ "learning_rate": 4.641492520712043e-07,
+ "loss": 0.3877,
+ "step": 1041
+ },
+ {
+ "epoch": 4.912225705329154,
+ "grad_norm": 0.6802616119384766,
+ "learning_rate": 4.60267190014011e-07,
+ "loss": 0.3965,
+ "step": 1042
+ },
+ {
+ "epoch": 4.91692789968652,
+ "grad_norm": 0.5508768558502197,
+ "learning_rate": 4.563997840272602e-07,
+ "loss": 0.3833,
+ "step": 1043
+ },
+ {
+ "epoch": 4.921630094043887,
+ "grad_norm": 0.9818223714828491,
+ "learning_rate": 4.5254706189938545e-07,
+ "loss": 0.3689,
+ "step": 1044
+ },
+ {
+ "epoch": 4.9263322884012535,
+ "grad_norm": 0.5540556907653809,
+ "learning_rate": 4.4870905131330827e-07,
+ "loss": 0.4081,
+ "step": 1045
+ },
+ {
+ "epoch": 4.931034482758621,
+ "grad_norm": 0.5338829159736633,
+ "learning_rate": 4.448857798462455e-07,
+ "loss": 0.4071,
+ "step": 1046
+ },
+ {
+ "epoch": 4.935736677115988,
+ "grad_norm": 0.5587465763092041,
+ "learning_rate": 4.4107727496950913e-07,
+ "loss": 0.3801,
+ "step": 1047
+ },
+ {
+ "epoch": 4.940438871473354,
+ "grad_norm": 0.5150395631790161,
+ "learning_rate": 4.372835640483089e-07,
+ "loss": 0.4002,
+ "step": 1048
+ },
+ {
+ "epoch": 4.945141065830721,
+ "grad_norm": 0.5582529902458191,
+ "learning_rate": 4.3350467434155526e-07,
+ "loss": 0.393,
+ "step": 1049
+ },
+ {
+ "epoch": 4.9498432601880875,
+ "grad_norm": 0.5755763649940491,
+ "learning_rate": 4.297406330016643e-07,
+ "loss": 0.3838,
+ "step": 1050
+ },
+ {
+ "epoch": 4.954545454545455,
+ "grad_norm": 0.5632887482643127,
+ "learning_rate": 4.25991467074362e-07,
+ "loss": 0.3752,
+ "step": 1051
+ },
+ {
+ "epoch": 4.959247648902822,
+ "grad_norm": 0.5089020729064941,
+ "learning_rate": 4.2225720349849063e-07,
+ "loss": 0.3873,
+ "step": 1052
+ },
+ {
+ "epoch": 4.963949843260188,
+ "grad_norm": 0.6206353306770325,
+ "learning_rate": 4.185378691058145e-07,
+ "loss": 0.3837,
+ "step": 1053
+ },
+ {
+ "epoch": 4.968652037617555,
+ "grad_norm": 0.8949421048164368,
+ "learning_rate": 4.148334906208273e-07,
+ "loss": 0.4126,
+ "step": 1054
+ },
+ {
+ "epoch": 4.9733542319749215,
+ "grad_norm": 0.5514953136444092,
+ "learning_rate": 4.1114409466056107e-07,
+ "loss": 0.3897,
+ "step": 1055
+ },
+ {
+ "epoch": 4.978056426332288,
+ "grad_norm": 0.681344211101532,
+ "learning_rate": 4.0746970773439115e-07,
+ "loss": 0.4165,
+ "step": 1056
+ },
+ {
+ "epoch": 4.982758620689655,
+ "grad_norm": 0.5986515283584595,
+ "learning_rate": 4.0381035624385336e-07,
+ "loss": 0.4007,
+ "step": 1057
+ },
+ {
+ "epoch": 4.987460815047022,
+ "grad_norm": 0.502730131149292,
+ "learning_rate": 4.0016606648244555e-07,
+ "loss": 0.3614,
+ "step": 1058
+ },
+ {
+ "epoch": 4.992163009404389,
+ "grad_norm": 0.5898148417472839,
+ "learning_rate": 3.9653686463544447e-07,
+ "loss": 0.4064,
+ "step": 1059
+ },
+ {
+ "epoch": 4.996865203761756,
+ "grad_norm": 0.6192370057106018,
+ "learning_rate": 3.929227767797153e-07,
+ "loss": 0.4027,
+ "step": 1060
+ }
+ ],
+ "logging_steps": 1,
+ "max_steps": 1272,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 6,
+ "save_steps": 212,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 6.827573142662572e+19,
+ "train_batch_size": 4,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/checkpoint-1060/training_args.bin b/checkpoint-1060/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7db90ca60ea3c300feb3b7d6e0cb54fc7cfb2060
--- /dev/null
+++ b/checkpoint-1060/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:51f85402b182fc4b86518e0cb9ca9cbf150300e36000a38f53507b9a8663ad4b
+size 7928
diff --git a/checkpoint-1060/zero_to_fp32.py b/checkpoint-1060/zero_to_fp32.py
new file mode 100644
index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8
--- /dev/null
+++ b/checkpoint-1060/zero_to_fp32.py
@@ -0,0 +1,604 @@
+#!/usr/bin/env python
+
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# This script extracts fp32 consolidated weights from zero 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example: python zero_to_fp32.py . pytorch_model.bin
+
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+from collections import OrderedDict
+from dataclasses import dataclass
+
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+ FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+ FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+
+
+@dataclass
+class zero_model_state:
+ buffers: dict()
+ param_shapes: dict()
+ shared_params: list
+ ds_version: int
+ frozen_param_shapes: dict()
+ frozen_param_fragments: dict()
+
+
+debug = 0
+
+# load to cpu
+device = torch.device('cpu')
+
+
+def atoi(text):
+ return int(text) if text.isdigit() else text
+
+
+def natural_keys(text):
+ '''
+ alist.sort(key=natural_keys) sorts in human order
+ http://nedbatchelder.com/blog/200712/human_sorting.html
+ (See Toothy's implementation in the comments)
+ '''
+ return [atoi(c) for c in re.split(r'(\d+)', text)]
+
+
+def get_model_state_file(checkpoint_dir, zero_stage):
+ if not os.path.isdir(checkpoint_dir):
+ raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+
+ # there should be only one file
+ if zero_stage <= 2:
+ file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+ elif zero_stage == 3:
+ file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+
+ if not os.path.exists(file):
+ raise FileNotFoundError(f"can't find model states file at '{file}'")
+
+ return file
+
+
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+ # XXX: need to test that this simple glob rule works for multi-node setup too
+ ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
+
+ if len(ckpt_files) == 0:
+ raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+
+ return ckpt_files
+
+
+def get_optim_files(checkpoint_dir):
+ return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
+
+
+def get_model_state_files(checkpoint_dir):
+ return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
+
+
+def parse_model_states(files):
+ zero_model_states = []
+ for file in files:
+ state_dict = torch.load(file, map_location=device)
+
+ if BUFFER_NAMES not in state_dict:
+ raise ValueError(f"{file} is not a model state checkpoint")
+ buffer_names = state_dict[BUFFER_NAMES]
+ if debug:
+ print("Found buffers:", buffer_names)
+
+ # recover just the buffers while restoring them to fp32 if they were saved in fp16
+ buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+ param_shapes = state_dict[PARAM_SHAPES]
+
+ # collect parameters that are included in param_shapes
+ param_names = []
+ for s in param_shapes:
+ for name in s.keys():
+ param_names.append(name)
+
+ # update with frozen parameters
+ frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+ if frozen_param_shapes is not None:
+ if debug:
+ print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+ param_names += list(frozen_param_shapes.keys())
+
+ # handle shared params
+ shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+
+ ds_version = state_dict.get(DS_VERSION, None)
+
+ frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+
+ z_model_state = zero_model_state(buffers=buffers,
+ param_shapes=param_shapes,
+ shared_params=shared_params,
+ ds_version=ds_version,
+ frozen_param_shapes=frozen_param_shapes,
+ frozen_param_fragments=frozen_param_fragments)
+ zero_model_states.append(z_model_state)
+
+ return zero_model_states
+
+
+def parse_optim_states(files, ds_checkpoint_dir):
+
+ total_files = len(files)
+ state_dicts = []
+ for f in files:
+ state_dict = torch.load(f, map_location=device)
+        # immediately discard the two potentially huge optimizer states as we only care for fp32 master weights
+ # and also handle the case where it was already removed by another helper script
+ state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
+ state_dicts.append(state_dict)
+
+ if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]:
+ raise ValueError(f"{files[0]} is not a zero checkpoint")
+ zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
+ world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
+
+ # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+ # parameters can be different from data parallelism for non-expert parameters. So we can just
+ # use the max of the partition_count to get the dp world_size.
+
+ if type(world_size) is list:
+ world_size = max(world_size)
+
+ if world_size != total_files:
+ raise ValueError(
+ f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+ "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+ )
+
+ # the groups are named differently in each stage
+ if zero_stage <= 2:
+ fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+ elif zero_stage == 3:
+ fp32_groups_key = FP32_FLAT_GROUPS
+ else:
+ raise ValueError(f"unknown zero stage {zero_stage}")
+
+ if zero_stage <= 2:
+ fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
+ elif zero_stage == 3:
+ # if there is more than one param group, there will be multiple flattened tensors - one
+ # flattened tensor per group - for simplicity merge them into a single tensor
+ #
+ # XXX: could make the script more memory efficient for when there are multiple groups - it
+ # will require matching the sub-lists of param_shapes for each param group flattened tensor
+
+ fp32_flat_groups = [
+ torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts))
+ ]
+
+ return zero_stage, world_size, fp32_flat_groups
+
+
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+ """
+ Returns fp32 state_dict reconstructed from ds checkpoint
+
+ Args:
+ - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+
+ """
+ print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+
+ optim_files = get_optim_files(ds_checkpoint_dir)
+ zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
+ print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+
+ model_files = get_model_state_files(ds_checkpoint_dir)
+
+ zero_model_states = parse_model_states(model_files)
+ print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+
+ if zero_stage <= 2:
+ return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters)
+ elif zero_stage == 3:
+ return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters)
+
+
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+ if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+ return
+
+ frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+ frozen_param_fragments = zero_model_states[0].frozen_param_fragments
+
+ if debug:
+ num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+ print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+ wanted_params = len(frozen_param_shapes)
+ wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+ avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+ print(f'Frozen params: Have {avail_numel} numels to process.')
+ print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+ total_params = 0
+ total_numel = 0
+ for name, shape in frozen_param_shapes.items():
+ total_params += 1
+ unpartitioned_numel = shape.numel()
+ total_numel += unpartitioned_numel
+
+ state_dict[name] = frozen_param_fragments[name]
+
+ if debug:
+ print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+
+ print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _has_callable(obj, fn):
+ attr = getattr(obj, fn, None)
+ return callable(attr)
+
+
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+ param_shapes = zero_model_states[0].param_shapes
+
+ # Reconstruction protocol:
+ #
+ # XXX: document this
+
+ if debug:
+ for i in range(world_size):
+ for j in range(len(fp32_flat_groups[0])):
+ print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+
+ # XXX: memory usage doubles here (zero2)
+ num_param_groups = len(fp32_flat_groups[0])
+ merged_single_partition_of_fp32_groups = []
+ for i in range(num_param_groups):
+ merged_partitions = [sd[i] for sd in fp32_flat_groups]
+ full_single_fp32_vector = torch.cat(merged_partitions, 0)
+ merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+ avail_numel = sum(
+ [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+
+ if debug:
+ wanted_params = sum([len(shapes) for shapes in param_shapes])
+ wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+ # not asserting if there is a mismatch due to possible padding
+ print(f"Have {avail_numel} numels to process.")
+ print(f"Need {wanted_numel} numels in {wanted_params} params.")
+
+ # params
+ # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+ # out-of-core computing solution
+ total_numel = 0
+ total_params = 0
+ for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+ offset = 0
+ avail_numel = full_single_fp32_vector.numel()
+ for name, shape in shapes.items():
+
+ unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+ total_numel += unpartitioned_numel
+ total_params += 1
+
+ if debug:
+ print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+ state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+ offset += unpartitioned_numel
+
+ # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+ # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+ # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+ # live optimizer object, so we are checking that the numbers are within the right range
+ align_to = 2 * world_size
+
+ def zero2_align(x):
+ return align_to * math.ceil(x / align_to)
+
+ if debug:
+ print(f"original offset={offset}, avail_numel={avail_numel}")
+
+ offset = zero2_align(offset)
+ avail_numel = zero2_align(avail_numel)
+
+ if debug:
+ print(f"aligned offset={offset}, avail_numel={avail_numel}")
+
+ # Sanity check
+ if offset != avail_numel:
+ raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+ print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters):
+ state_dict = OrderedDict()
+
+ # buffers
+ buffers = zero_model_states[0].buffers
+ state_dict.update(buffers)
+ if debug:
+ print(f"added {len(buffers)} buffers")
+
+ if not exclude_frozen_parameters:
+ _zero2_merge_frozen_params(state_dict, zero_model_states)
+
+ _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+ # recover shared parameters
+ for pair in zero_model_states[0].shared_params:
+ if pair[1] in state_dict:
+ state_dict[pair[0]] = state_dict[pair[1]]
+
+ return state_dict
+
+
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+ remainder = unpartitioned_numel % world_size
+ padding_numel = (world_size - remainder) if remainder else 0
+ partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+ return partitioned_numel, padding_numel
+
+
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+ if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+ return
+
+ if debug:
+ for i in range(world_size):
+ num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+ print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+ frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+ wanted_params = len(frozen_param_shapes)
+ wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+ avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+ print(f'Frozen params: Have {avail_numel} numels to process.')
+ print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+ total_params = 0
+ total_numel = 0
+ for name, shape in zero_model_states[0].frozen_param_shapes.items():
+ total_params += 1
+ unpartitioned_numel = shape.numel()
+ total_numel += unpartitioned_numel
+
+ param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+ state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+
+ partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+ if debug:
+ print(
+ f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+ )
+
+ print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+ param_shapes = zero_model_states[0].param_shapes
+ avail_numel = fp32_flat_groups[0].numel() * world_size
+ # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+ # param, re-consolidating each param, while dealing with padding if any
+
+ # merge list of dicts, preserving order
+ param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+
+ if debug:
+ for i in range(world_size):
+ print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+
+ wanted_params = len(param_shapes)
+ wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+ # not asserting if there is a mismatch due to possible padding
+ avail_numel = fp32_flat_groups[0].numel() * world_size
+ print(f"Trainable params: Have {avail_numel} numels to process.")
+ print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+
+ # params
+ # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+ # out-of-core computing solution
+ offset = 0
+ total_numel = 0
+ total_params = 0
+ for name, shape in param_shapes.items():
+
+ unpartitioned_numel = shape.numel()
+ total_numel += unpartitioned_numel
+ total_params += 1
+
+ partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+ if debug:
+ print(
+ f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+ )
+
+ # XXX: memory usage doubles here
+ state_dict[name] = torch.cat(
+ tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)),
+ 0).narrow(0, 0, unpartitioned_numel).view(shape)
+ offset += partitioned_numel
+
+ offset *= world_size
+
+ # Sanity check
+ if offset != avail_numel:
+ raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+ print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters):
+ state_dict = OrderedDict()
+
+ # buffers
+ buffers = zero_model_states[0].buffers
+ state_dict.update(buffers)
+ if debug:
+ print(f"added {len(buffers)} buffers")
+
+ if not exclude_frozen_parameters:
+ _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+
+ _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+ # recover shared parameters
+ for pair in zero_model_states[0].shared_params:
+ if pair[1] in state_dict:
+ state_dict[pair[0]] = state_dict[pair[1]]
+
+ return state_dict
+
+
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False):
+ """
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+ ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+ via a model hub.
+
+ Args:
+ - ``checkpoint_dir``: path to the desired checkpoint folder
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+ - ``exclude_frozen_parameters``: exclude frozen parameters
+
+ Returns:
+ - pytorch ``state_dict``
+
+ Note: this approach may not work if your application doesn't have sufficient free CPU memory and
+ you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+ the checkpoint.
+
+ A typical usage might be ::
+
+ from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+ # do the training and checkpoint saving
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+ model = model.cpu() # move to cpu
+ model.load_state_dict(state_dict)
+ # submit to model hub or save the model to share with others
+
+ In this example the ``model`` will no longer be usable in the deepspeed context of the same
+ application. i.e. you will need to re-initialize the deepspeed engine, since
+ ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+ If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+
+ """
+ if tag is None:
+ latest_path = os.path.join(checkpoint_dir, 'latest')
+ if os.path.isfile(latest_path):
+ with open(latest_path, 'r') as fd:
+ tag = fd.read().strip()
+ else:
+ raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+
+ ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+
+ if not os.path.isdir(ds_checkpoint_dir):
+ raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+
+ return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+
+
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False):
+ """
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
+ loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+
+ Args:
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+ - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+ - ``exclude_frozen_parameters``: exclude frozen parameters
+ """
+
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters)
+ print(f"Saving fp32 state dict to {output_file}")
+ torch.save(state_dict, output_file)
+
+
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+ """
+ 1. Put the provided model to cpu
+ 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+ 3. Load it into the provided model
+
+ Args:
+ - ``model``: the model object to update
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+
+ Returns:
+    - ``model``: modified model
+
+ Make sure you have plenty of CPU memory available before you call this function. If you don't
+ have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+ conveniently placed for you in the checkpoint folder.
+
+ A typical usage might be ::
+
+ from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+ model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+ # submit to model hub or save the model to share with others
+
+ Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+ of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+ ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+ """
+ logger.info(f"Extracting fp32 weights")
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+
+ logger.info(f"Overwriting model with fp32 weights")
+ model = model.cpu()
+ model.load_state_dict(state_dict, strict=False)
+
+ return model
+
+
+if __name__ == "__main__":
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument("checkpoint_dir",
+ type=str,
+ help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+ parser.add_argument(
+ "output_file",
+ type=str,
+ help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)")
+ parser.add_argument("-t",
+ "--tag",
+ type=str,
+ default=None,
+ help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+ parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+ parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+ args = parser.parse_args()
+
+ debug = args.debug
+
+ convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+ args.output_file,
+ tag=args.tag,
+ exclude_frozen_parameters=args.exclude_frozen_parameters)
diff --git a/checkpoint-1272/README.md b/checkpoint-1272/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1b184114a0c28ed3e4c082c18486736dc818166d
--- /dev/null
+++ b/checkpoint-1272/README.md
@@ -0,0 +1,202 @@
+---
+base_model: meta-llama/Llama-3.3-70B-Instruct
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.15.0
\ No newline at end of file
diff --git a/checkpoint-1272/adapter_config.json b/checkpoint-1272/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..dc930e1be2d901773c96d6e6d186c72676cbf328
--- /dev/null
+++ b/checkpoint-1272/adapter_config.json
@@ -0,0 +1,42 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "meta-llama/Llama-3.3-70B-Instruct",
+ "bias": "none",
+ "corda_config": null,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": null,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 512,
+ "lora_bias": false,
+ "lora_dropout": 0.05,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": [
+ "embed_tokens",
+ "lm_head"
+ ],
+ "peft_type": "LORA",
+ "r": 256,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "up_proj",
+ "gate_proj",
+ "o_proj",
+ "v_proj",
+ "q_proj",
+ "k_proj",
+ "down_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-1272/adapter_model.safetensors b/checkpoint-1272/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..fc5b9d288686bcfc0dd8fef5f40baf5e7a82badf
--- /dev/null
+++ b/checkpoint-1272/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:daf3ced3bd8b21263fefde6234932a6f73d3a1191d93694a7382d35b17c0be53
+size 10829849744
diff --git a/checkpoint-1272/global_step1273/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-1272/global_step1273/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..b01dfc5fb36aa6b8452c270e4f316a02c7d89e1e
--- /dev/null
+++ b/checkpoint-1272/global_step1273/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1932fb03b9d091d1c7fd843aca2c701c5c5b6438957e8cf859f4bcfd08a72695
+size 21659418140
diff --git a/checkpoint-1272/global_step1273/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-1272/global_step1273/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..0f40ad0726ed072e7e577f321938f160526824f6
--- /dev/null
+++ b/checkpoint-1272/global_step1273/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b6faf285b19c3eb0f8f139b00e3bda4fcd6dabbb4a9930a6e23742c870491827
+size 21659457372
diff --git a/checkpoint-1272/global_step1273/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-1272/global_step1273/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..0aec61468337bd63d2eaaa0495866bcf18fd640d
--- /dev/null
+++ b/checkpoint-1272/global_step1273/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:31338c47d70c8de6ee30a7e92d617deede328013a3caa1ccdd4fcc7a6e3fd2cb
+size 21659417820
diff --git a/checkpoint-1272/global_step1273/mp_rank_00_model_states.pt b/checkpoint-1272/global_step1273/mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..b38e8f60e99d1805cdd3564c1bc0c0d74f4c2a2e
--- /dev/null
+++ b/checkpoint-1272/global_step1273/mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c6b6772e045a5cb580a3d24459c28981e76c2cbdf0b6a5d04cfe395a3a7b38fe
+size 11918643933
diff --git a/checkpoint-1272/latest b/checkpoint-1272/latest
new file mode 100644
index 0000000000000000000000000000000000000000..4daa77dac401c2d1ff684a33034735b7d056416e
--- /dev/null
+++ b/checkpoint-1272/latest
@@ -0,0 +1 @@
+global_step1273
\ No newline at end of file
diff --git a/checkpoint-1272/rng_state_0.pth b/checkpoint-1272/rng_state_0.pth
new file mode 100644
index 0000000000000000000000000000000000000000..e54eb430e8e13c6f2c081473775d427dbaa59e05
--- /dev/null
+++ b/checkpoint-1272/rng_state_0.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:029c12466068440573cbe2e52e888af4cb4f676c48ad2727e053629d857a31ef
+size 14768
diff --git a/checkpoint-1272/rng_state_1.pth b/checkpoint-1272/rng_state_1.pth
new file mode 100644
index 0000000000000000000000000000000000000000..6570fdacf5c7764891aaaf0265ad8e2a37d2636d
--- /dev/null
+++ b/checkpoint-1272/rng_state_1.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5a574e42064d50c7d210a9ce4da886390eb8cf932aa6ff0d5d998299bc52456f
+size 14768
diff --git a/checkpoint-1272/rng_state_2.pth b/checkpoint-1272/rng_state_2.pth
new file mode 100644
index 0000000000000000000000000000000000000000..34aa3a9daf75c010c2e1f9927a594613ac4332cf
--- /dev/null
+++ b/checkpoint-1272/rng_state_2.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a1589739795a553c3a995ad6c2c7f0222b1955aaa2573f62ea00bb57f2ad34cc
+size 14768
diff --git a/checkpoint-1272/scheduler.pt b/checkpoint-1272/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..6611a6496731599d3dbafa8ede45d514c6522713
--- /dev/null
+++ b/checkpoint-1272/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cbb89639618c0c99d314e968dcb22ad54dca9cb8b379c634543095eb6766cc9f
+size 1064
diff --git a/checkpoint-1272/special_tokens_map.json b/checkpoint-1272/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..278b7f0f84be865c4687700ee7b3c63d89a51e18
--- /dev/null
+++ b/checkpoint-1272/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+ "bos_token": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "<|eot_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/checkpoint-1272/tokenizer.json b/checkpoint-1272/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2
--- /dev/null
+++ b/checkpoint-1272/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
+size 17209920
diff --git a/checkpoint-1272/tokenizer_config.json b/checkpoint-1272/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ca91a2ef55f4239a7af81d7c9abb05f53621a07b
--- /dev/null
+++ b/checkpoint-1272/tokenizer_config.json
@@ -0,0 +1,2064 @@
+{
+ "added_tokens_decoder": {
+ "128000": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128001": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128002": {
+ "content": "<|reserved_special_token_0|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128003": {
+ "content": "<|reserved_special_token_1|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128004": {
+ "content": "<|finetune_right_pad_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128005": {
+ "content": "<|reserved_special_token_2|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128006": {
+ "content": "<|start_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128007": {
+ "content": "<|end_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128008": {
+ "content": "<|eom_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128009": {
+ "content": "<|eot_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128010": {
+ "content": "<|python_tag|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128011": {
+ "content": "<|reserved_special_token_3|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128012": {
+ "content": "<|reserved_special_token_4|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128013": {
+ "content": "<|reserved_special_token_5|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128014": {
+ "content": "<|reserved_special_token_6|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128015": {
+ "content": "<|reserved_special_token_7|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128016": {
+ "content": "<|reserved_special_token_8|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128017": {
+ "content": "<|reserved_special_token_9|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128018": {
+ "content": "<|reserved_special_token_10|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128019": {
+ "content": "<|reserved_special_token_11|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128020": {
+ "content": "<|reserved_special_token_12|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128021": {
+ "content": "<|reserved_special_token_13|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128022": {
+ "content": "<|reserved_special_token_14|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128023": {
+ "content": "<|reserved_special_token_15|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128024": {
+ "content": "<|reserved_special_token_16|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128025": {
+ "content": "<|reserved_special_token_17|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128026": {
+ "content": "<|reserved_special_token_18|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128027": {
+ "content": "<|reserved_special_token_19|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128028": {
+ "content": "<|reserved_special_token_20|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128029": {
+ "content": "<|reserved_special_token_21|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128030": {
+ "content": "<|reserved_special_token_22|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128031": {
+ "content": "<|reserved_special_token_23|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128032": {
+ "content": "<|reserved_special_token_24|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128033": {
+ "content": "<|reserved_special_token_25|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128034": {
+ "content": "<|reserved_special_token_26|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128035": {
+ "content": "<|reserved_special_token_27|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128036": {
+ "content": "<|reserved_special_token_28|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128037": {
+ "content": "<|reserved_special_token_29|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128038": {
+ "content": "<|reserved_special_token_30|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128039": {
+ "content": "<|reserved_special_token_31|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128040": {
+ "content": "<|reserved_special_token_32|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128041": {
+ "content": "<|reserved_special_token_33|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128042": {
+ "content": "<|reserved_special_token_34|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128043": {
+ "content": "<|reserved_special_token_35|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128044": {
+ "content": "<|reserved_special_token_36|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128045": {
+ "content": "<|reserved_special_token_37|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128046": {
+ "content": "<|reserved_special_token_38|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128047": {
+ "content": "<|reserved_special_token_39|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128048": {
+ "content": "<|reserved_special_token_40|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128049": {
+ "content": "<|reserved_special_token_41|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128050": {
+ "content": "<|reserved_special_token_42|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128051": {
+ "content": "<|reserved_special_token_43|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128052": {
+ "content": "<|reserved_special_token_44|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128053": {
+ "content": "<|reserved_special_token_45|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128054": {
+ "content": "<|reserved_special_token_46|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128055": {
+ "content": "<|reserved_special_token_47|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128056": {
+ "content": "<|reserved_special_token_48|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128057": {
+ "content": "<|reserved_special_token_49|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128058": {
+ "content": "<|reserved_special_token_50|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128059": {
+ "content": "<|reserved_special_token_51|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128060": {
+ "content": "<|reserved_special_token_52|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128061": {
+ "content": "<|reserved_special_token_53|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128062": {
+ "content": "<|reserved_special_token_54|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128063": {
+ "content": "<|reserved_special_token_55|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128064": {
+ "content": "<|reserved_special_token_56|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128065": {
+ "content": "<|reserved_special_token_57|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128066": {
+ "content": "<|reserved_special_token_58|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128067": {
+ "content": "<|reserved_special_token_59|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128068": {
+ "content": "<|reserved_special_token_60|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128069": {
+ "content": "<|reserved_special_token_61|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128070": {
+ "content": "<|reserved_special_token_62|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128071": {
+ "content": "<|reserved_special_token_63|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128072": {
+ "content": "<|reserved_special_token_64|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128073": {
+ "content": "<|reserved_special_token_65|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128074": {
+ "content": "<|reserved_special_token_66|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128075": {
+ "content": "<|reserved_special_token_67|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128076": {
+ "content": "<|reserved_special_token_68|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128077": {
+ "content": "<|reserved_special_token_69|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128078": {
+ "content": "<|reserved_special_token_70|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128079": {
+ "content": "<|reserved_special_token_71|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128080": {
+ "content": "<|reserved_special_token_72|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128081": {
+ "content": "<|reserved_special_token_73|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128082": {
+ "content": "<|reserved_special_token_74|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128083": {
+ "content": "<|reserved_special_token_75|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128084": {
+ "content": "<|reserved_special_token_76|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128085": {
+ "content": "<|reserved_special_token_77|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128086": {
+ "content": "<|reserved_special_token_78|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128087": {
+ "content": "<|reserved_special_token_79|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128088": {
+ "content": "<|reserved_special_token_80|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128089": {
+ "content": "<|reserved_special_token_81|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128090": {
+ "content": "<|reserved_special_token_82|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128091": {
+ "content": "<|reserved_special_token_83|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128092": {
+ "content": "<|reserved_special_token_84|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128093": {
+ "content": "<|reserved_special_token_85|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128094": {
+ "content": "<|reserved_special_token_86|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128095": {
+ "content": "<|reserved_special_token_87|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128096": {
+ "content": "<|reserved_special_token_88|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128097": {
+ "content": "<|reserved_special_token_89|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128098": {
+ "content": "<|reserved_special_token_90|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128099": {
+ "content": "<|reserved_special_token_91|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128100": {
+ "content": "<|reserved_special_token_92|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128101": {
+ "content": "<|reserved_special_token_93|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128102": {
+ "content": "<|reserved_special_token_94|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128103": {
+ "content": "<|reserved_special_token_95|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128104": {
+ "content": "<|reserved_special_token_96|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128105": {
+ "content": "<|reserved_special_token_97|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128106": {
+ "content": "<|reserved_special_token_98|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128107": {
+ "content": "<|reserved_special_token_99|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128108": {
+ "content": "<|reserved_special_token_100|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128109": {
+ "content": "<|reserved_special_token_101|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128110": {
+ "content": "<|reserved_special_token_102|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128111": {
+ "content": "<|reserved_special_token_103|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128112": {
+ "content": "<|reserved_special_token_104|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128113": {
+ "content": "<|reserved_special_token_105|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128114": {
+ "content": "<|reserved_special_token_106|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128115": {
+ "content": "<|reserved_special_token_107|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128116": {
+ "content": "<|reserved_special_token_108|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128117": {
+ "content": "<|reserved_special_token_109|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128118": {
+ "content": "<|reserved_special_token_110|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128119": {
+ "content": "<|reserved_special_token_111|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128120": {
+ "content": "<|reserved_special_token_112|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128121": {
+ "content": "<|reserved_special_token_113|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128122": {
+ "content": "<|reserved_special_token_114|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128123": {
+ "content": "<|reserved_special_token_115|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128124": {
+ "content": "<|reserved_special_token_116|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128125": {
+ "content": "<|reserved_special_token_117|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128126": {
+ "content": "<|reserved_special_token_118|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128127": {
+ "content": "<|reserved_special_token_119|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128128": {
+ "content": "<|reserved_special_token_120|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128129": {
+ "content": "<|reserved_special_token_121|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128130": {
+ "content": "<|reserved_special_token_122|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128131": {
+ "content": "<|reserved_special_token_123|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128132": {
+ "content": "<|reserved_special_token_124|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128133": {
+ "content": "<|reserved_special_token_125|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128134": {
+ "content": "<|reserved_special_token_126|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128135": {
+ "content": "<|reserved_special_token_127|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128136": {
+ "content": "<|reserved_special_token_128|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128137": {
+ "content": "<|reserved_special_token_129|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128138": {
+ "content": "<|reserved_special_token_130|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128139": {
+ "content": "<|reserved_special_token_131|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128140": {
+ "content": "<|reserved_special_token_132|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128141": {
+ "content": "<|reserved_special_token_133|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128142": {
+ "content": "<|reserved_special_token_134|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128143": {
+ "content": "<|reserved_special_token_135|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128144": {
+ "content": "<|reserved_special_token_136|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128145": {
+ "content": "<|reserved_special_token_137|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128146": {
+ "content": "<|reserved_special_token_138|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128147": {
+ "content": "<|reserved_special_token_139|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128148": {
+ "content": "<|reserved_special_token_140|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128149": {
+ "content": "<|reserved_special_token_141|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128150": {
+ "content": "<|reserved_special_token_142|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128151": {
+ "content": "<|reserved_special_token_143|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128152": {
+ "content": "<|reserved_special_token_144|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128153": {
+ "content": "<|reserved_special_token_145|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128154": {
+ "content": "<|reserved_special_token_146|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128155": {
+ "content": "<|reserved_special_token_147|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128156": {
+ "content": "<|reserved_special_token_148|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128157": {
+ "content": "<|reserved_special_token_149|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128158": {
+ "content": "<|reserved_special_token_150|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128159": {
+ "content": "<|reserved_special_token_151|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128160": {
+ "content": "<|reserved_special_token_152|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128161": {
+ "content": "<|reserved_special_token_153|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128162": {
+ "content": "<|reserved_special_token_154|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128163": {
+ "content": "<|reserved_special_token_155|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128164": {
+ "content": "<|reserved_special_token_156|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128165": {
+ "content": "<|reserved_special_token_157|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128166": {
+ "content": "<|reserved_special_token_158|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128167": {
+ "content": "<|reserved_special_token_159|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128168": {
+ "content": "<|reserved_special_token_160|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128169": {
+ "content": "<|reserved_special_token_161|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128170": {
+ "content": "<|reserved_special_token_162|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128171": {
+ "content": "<|reserved_special_token_163|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128172": {
+ "content": "<|reserved_special_token_164|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128173": {
+ "content": "<|reserved_special_token_165|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128174": {
+ "content": "<|reserved_special_token_166|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128175": {
+ "content": "<|reserved_special_token_167|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128176": {
+ "content": "<|reserved_special_token_168|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128177": {
+ "content": "<|reserved_special_token_169|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128178": {
+ "content": "<|reserved_special_token_170|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128179": {
+ "content": "<|reserved_special_token_171|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128180": {
+ "content": "<|reserved_special_token_172|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128181": {
+ "content": "<|reserved_special_token_173|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128182": {
+ "content": "<|reserved_special_token_174|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128183": {
+ "content": "<|reserved_special_token_175|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128184": {
+ "content": "<|reserved_special_token_176|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128185": {
+ "content": "<|reserved_special_token_177|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128186": {
+ "content": "<|reserved_special_token_178|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128187": {
+ "content": "<|reserved_special_token_179|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128188": {
+ "content": "<|reserved_special_token_180|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128189": {
+ "content": "<|reserved_special_token_181|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128190": {
+ "content": "<|reserved_special_token_182|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128191": {
+ "content": "<|reserved_special_token_183|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128192": {
+ "content": "<|reserved_special_token_184|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128193": {
+ "content": "<|reserved_special_token_185|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128194": {
+ "content": "<|reserved_special_token_186|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128195": {
+ "content": "<|reserved_special_token_187|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128196": {
+ "content": "<|reserved_special_token_188|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128197": {
+ "content": "<|reserved_special_token_189|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128198": {
+ "content": "<|reserved_special_token_190|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128199": {
+ "content": "<|reserved_special_token_191|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128200": {
+ "content": "<|reserved_special_token_192|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128201": {
+ "content": "<|reserved_special_token_193|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128202": {
+ "content": "<|reserved_special_token_194|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128203": {
+ "content": "<|reserved_special_token_195|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128204": {
+ "content": "<|reserved_special_token_196|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128205": {
+ "content": "<|reserved_special_token_197|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128206": {
+ "content": "<|reserved_special_token_198|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128207": {
+ "content": "<|reserved_special_token_199|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128208": {
+ "content": "<|reserved_special_token_200|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128209": {
+ "content": "<|reserved_special_token_201|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128210": {
+ "content": "<|reserved_special_token_202|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128211": {
+ "content": "<|reserved_special_token_203|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128212": {
+ "content": "<|reserved_special_token_204|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128213": {
+ "content": "<|reserved_special_token_205|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128214": {
+ "content": "<|reserved_special_token_206|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128215": {
+ "content": "<|reserved_special_token_207|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128216": {
+ "content": "<|reserved_special_token_208|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128217": {
+ "content": "<|reserved_special_token_209|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128218": {
+ "content": "<|reserved_special_token_210|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128219": {
+ "content": "<|reserved_special_token_211|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128220": {
+ "content": "<|reserved_special_token_212|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128221": {
+ "content": "<|reserved_special_token_213|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128222": {
+ "content": "<|reserved_special_token_214|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128223": {
+ "content": "<|reserved_special_token_215|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128224": {
+ "content": "<|reserved_special_token_216|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128225": {
+ "content": "<|reserved_special_token_217|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128226": {
+ "content": "<|reserved_special_token_218|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128227": {
+ "content": "<|reserved_special_token_219|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128228": {
+ "content": "<|reserved_special_token_220|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128229": {
+ "content": "<|reserved_special_token_221|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128230": {
+ "content": "<|reserved_special_token_222|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128231": {
+ "content": "<|reserved_special_token_223|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128232": {
+ "content": "<|reserved_special_token_224|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128233": {
+ "content": "<|reserved_special_token_225|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128234": {
+ "content": "<|reserved_special_token_226|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128235": {
+ "content": "<|reserved_special_token_227|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128236": {
+ "content": "<|reserved_special_token_228|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128237": {
+ "content": "<|reserved_special_token_229|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128238": {
+ "content": "<|reserved_special_token_230|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128239": {
+ "content": "<|reserved_special_token_231|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128240": {
+ "content": "<|reserved_special_token_232|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128241": {
+ "content": "<|reserved_special_token_233|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128242": {
+ "content": "<|reserved_special_token_234|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128243": {
+ "content": "<|reserved_special_token_235|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128244": {
+ "content": "<|reserved_special_token_236|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128245": {
+ "content": "<|reserved_special_token_237|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128246": {
+ "content": "<|reserved_special_token_238|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128247": {
+ "content": "<|reserved_special_token_239|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128248": {
+ "content": "<|reserved_special_token_240|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128249": {
+ "content": "<|reserved_special_token_241|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128250": {
+ "content": "<|reserved_special_token_242|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128251": {
+ "content": "<|reserved_special_token_243|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128252": {
+ "content": "<|reserved_special_token_244|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128253": {
+ "content": "<|reserved_special_token_245|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128254": {
+ "content": "<|reserved_special_token_246|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128255": {
+ "content": "<|reserved_special_token_247|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<|begin_of_text|>",
+ "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n",
+ "clean_up_tokenization_spaces": true,
+ "eos_token": "<|eot_id|>",
+ "extra_special_tokens": {},
+ "model_input_names": [
+ "input_ids",
+ "attention_mask"
+ ],
+ "model_max_length": 131072,
+ "pad_token": "<|end_of_text|>",
+ "tokenizer_class": "PreTrainedTokenizer"
+}
diff --git a/checkpoint-1272/trainer_state.json b/checkpoint-1272/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..deb1bfa6550dadb581ca18c802e3cb93f0821c1a
--- /dev/null
+++ b/checkpoint-1272/trainer_state.json
@@ -0,0 +1,8937 @@
+{
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 5.996865203761756,
+ "eval_steps": 500,
+ "global_step": 1272,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.004702194357366771,
+ "grad_norm": 3.1606569290161133,
+ "learning_rate": 5.0000000000000004e-08,
+ "loss": 1.0072,
+ "step": 1
+ },
+ {
+ "epoch": 0.009404388714733543,
+ "grad_norm": 3.2058725357055664,
+ "learning_rate": 1.0000000000000001e-07,
+ "loss": 1.0134,
+ "step": 2
+ },
+ {
+ "epoch": 0.014106583072100314,
+ "grad_norm": 2.636291265487671,
+ "learning_rate": 1.5000000000000002e-07,
+ "loss": 0.9635,
+ "step": 3
+ },
+ {
+ "epoch": 0.018808777429467086,
+ "grad_norm": 2.708746910095215,
+ "learning_rate": 2.0000000000000002e-07,
+ "loss": 1.0068,
+ "step": 4
+ },
+ {
+ "epoch": 0.023510971786833857,
+ "grad_norm": 2.8948426246643066,
+ "learning_rate": 2.5000000000000004e-07,
+ "loss": 0.9608,
+ "step": 5
+ },
+ {
+ "epoch": 0.02821316614420063,
+ "grad_norm": 2.8740086555480957,
+ "learning_rate": 3.0000000000000004e-07,
+ "loss": 0.9896,
+ "step": 6
+ },
+ {
+ "epoch": 0.032915360501567396,
+ "grad_norm": 2.8338170051574707,
+ "learning_rate": 3.5000000000000004e-07,
+ "loss": 0.9098,
+ "step": 7
+ },
+ {
+ "epoch": 0.03761755485893417,
+ "grad_norm": 2.7783002853393555,
+ "learning_rate": 4.0000000000000003e-07,
+ "loss": 0.9733,
+ "step": 8
+ },
+ {
+ "epoch": 0.04231974921630094,
+ "grad_norm": 3.043574333190918,
+ "learning_rate": 4.5000000000000003e-07,
+ "loss": 0.9943,
+ "step": 9
+ },
+ {
+ "epoch": 0.047021943573667714,
+ "grad_norm": 3.142383337020874,
+ "learning_rate": 5.000000000000001e-07,
+ "loss": 0.9475,
+ "step": 10
+ },
+ {
+ "epoch": 0.05172413793103448,
+ "grad_norm": 2.9817280769348145,
+ "learning_rate": 5.5e-07,
+ "loss": 0.9701,
+ "step": 11
+ },
+ {
+ "epoch": 0.05642633228840126,
+ "grad_norm": 2.95699405670166,
+ "learning_rate": 6.000000000000001e-07,
+ "loss": 0.9983,
+ "step": 12
+ },
+ {
+ "epoch": 0.061128526645768025,
+ "grad_norm": 2.8782453536987305,
+ "learning_rate": 6.5e-07,
+ "loss": 0.9502,
+ "step": 13
+ },
+ {
+ "epoch": 0.06583072100313479,
+ "grad_norm": 2.6715071201324463,
+ "learning_rate": 7.000000000000001e-07,
+ "loss": 0.9436,
+ "step": 14
+ },
+ {
+ "epoch": 0.07053291536050156,
+ "grad_norm": 3.869649648666382,
+ "learning_rate": 7.5e-07,
+ "loss": 0.9692,
+ "step": 15
+ },
+ {
+ "epoch": 0.07523510971786834,
+ "grad_norm": 3.060220956802368,
+ "learning_rate": 8.000000000000001e-07,
+ "loss": 0.9258,
+ "step": 16
+ },
+ {
+ "epoch": 0.07993730407523511,
+ "grad_norm": 2.8922741413116455,
+ "learning_rate": 8.500000000000001e-07,
+ "loss": 0.9719,
+ "step": 17
+ },
+ {
+ "epoch": 0.08463949843260188,
+ "grad_norm": 2.7857820987701416,
+ "learning_rate": 9.000000000000001e-07,
+ "loss": 0.9072,
+ "step": 18
+ },
+ {
+ "epoch": 0.08934169278996865,
+ "grad_norm": 2.9753293991088867,
+ "learning_rate": 9.500000000000001e-07,
+ "loss": 0.9032,
+ "step": 19
+ },
+ {
+ "epoch": 0.09404388714733543,
+ "grad_norm": 2.7989683151245117,
+ "learning_rate": 1.0000000000000002e-06,
+ "loss": 0.8887,
+ "step": 20
+ },
+ {
+ "epoch": 0.0987460815047022,
+ "grad_norm": 2.3953049182891846,
+ "learning_rate": 1.0500000000000001e-06,
+ "loss": 0.8968,
+ "step": 21
+ },
+ {
+ "epoch": 0.10344827586206896,
+ "grad_norm": 2.643731117248535,
+ "learning_rate": 1.1e-06,
+ "loss": 0.8501,
+ "step": 22
+ },
+ {
+ "epoch": 0.10815047021943573,
+ "grad_norm": 2.3679006099700928,
+ "learning_rate": 1.1500000000000002e-06,
+ "loss": 0.8476,
+ "step": 23
+ },
+ {
+ "epoch": 0.11285266457680251,
+ "grad_norm": 2.5935540199279785,
+ "learning_rate": 1.2000000000000002e-06,
+ "loss": 0.8095,
+ "step": 24
+ },
+ {
+ "epoch": 0.11755485893416928,
+ "grad_norm": 2.510300636291504,
+ "learning_rate": 1.25e-06,
+ "loss": 0.8099,
+ "step": 25
+ },
+ {
+ "epoch": 0.12225705329153605,
+ "grad_norm": 2.372344970703125,
+ "learning_rate": 1.3e-06,
+ "loss": 0.7869,
+ "step": 26
+ },
+ {
+ "epoch": 0.12695924764890282,
+ "grad_norm": 2.303426504135132,
+ "learning_rate": 1.3500000000000002e-06,
+ "loss": 0.7758,
+ "step": 27
+ },
+ {
+ "epoch": 0.13166144200626959,
+ "grad_norm": 1.9017939567565918,
+ "learning_rate": 1.4000000000000001e-06,
+ "loss": 0.7498,
+ "step": 28
+ },
+ {
+ "epoch": 0.13636363636363635,
+ "grad_norm": 1.8810580968856812,
+ "learning_rate": 1.45e-06,
+ "loss": 0.7878,
+ "step": 29
+ },
+ {
+ "epoch": 0.14106583072100312,
+ "grad_norm": 1.7797424793243408,
+ "learning_rate": 1.5e-06,
+ "loss": 0.7747,
+ "step": 30
+ },
+ {
+ "epoch": 0.14576802507836992,
+ "grad_norm": 1.5053879022598267,
+ "learning_rate": 1.5500000000000002e-06,
+ "loss": 0.7735,
+ "step": 31
+ },
+ {
+ "epoch": 0.15047021943573669,
+ "grad_norm": 1.4909234046936035,
+ "learning_rate": 1.6000000000000001e-06,
+ "loss": 0.7654,
+ "step": 32
+ },
+ {
+ "epoch": 0.15517241379310345,
+ "grad_norm": 1.36083984375,
+ "learning_rate": 1.6500000000000003e-06,
+ "loss": 0.6895,
+ "step": 33
+ },
+ {
+ "epoch": 0.15987460815047022,
+ "grad_norm": 1.536014199256897,
+ "learning_rate": 1.7000000000000002e-06,
+ "loss": 0.675,
+ "step": 34
+ },
+ {
+ "epoch": 0.164576802507837,
+ "grad_norm": 1.3426779508590698,
+ "learning_rate": 1.75e-06,
+ "loss": 0.7652,
+ "step": 35
+ },
+ {
+ "epoch": 0.16927899686520376,
+ "grad_norm": 1.4900612831115723,
+ "learning_rate": 1.8000000000000001e-06,
+ "loss": 0.6863,
+ "step": 36
+ },
+ {
+ "epoch": 0.17398119122257052,
+ "grad_norm": 1.181241750717163,
+ "learning_rate": 1.85e-06,
+ "loss": 0.7136,
+ "step": 37
+ },
+ {
+ "epoch": 0.1786833855799373,
+ "grad_norm": 1.461419701576233,
+ "learning_rate": 1.9000000000000002e-06,
+ "loss": 0.7606,
+ "step": 38
+ },
+ {
+ "epoch": 0.1833855799373041,
+ "grad_norm": 1.04817795753479,
+ "learning_rate": 1.9500000000000004e-06,
+ "loss": 0.6829,
+ "step": 39
+ },
+ {
+ "epoch": 0.18808777429467086,
+ "grad_norm": 1.0499993562698364,
+ "learning_rate": 2.0000000000000003e-06,
+ "loss": 0.7144,
+ "step": 40
+ },
+ {
+ "epoch": 0.19278996865203762,
+ "grad_norm": 0.9935064315795898,
+ "learning_rate": 2.05e-06,
+ "loss": 0.6736,
+ "step": 41
+ },
+ {
+ "epoch": 0.1974921630094044,
+ "grad_norm": 0.9919099807739258,
+ "learning_rate": 2.1000000000000002e-06,
+ "loss": 0.7151,
+ "step": 42
+ },
+ {
+ "epoch": 0.20219435736677116,
+ "grad_norm": 0.919556200504303,
+ "learning_rate": 2.15e-06,
+ "loss": 0.6847,
+ "step": 43
+ },
+ {
+ "epoch": 0.20689655172413793,
+ "grad_norm": 1.4762015342712402,
+ "learning_rate": 2.2e-06,
+ "loss": 0.6694,
+ "step": 44
+ },
+ {
+ "epoch": 0.2115987460815047,
+ "grad_norm": 0.9243163466453552,
+ "learning_rate": 2.25e-06,
+ "loss": 0.6489,
+ "step": 45
+ },
+ {
+ "epoch": 0.21630094043887146,
+ "grad_norm": 0.7614469528198242,
+ "learning_rate": 2.3000000000000004e-06,
+ "loss": 0.6568,
+ "step": 46
+ },
+ {
+ "epoch": 0.22100313479623823,
+ "grad_norm": 0.7543922662734985,
+ "learning_rate": 2.35e-06,
+ "loss": 0.6359,
+ "step": 47
+ },
+ {
+ "epoch": 0.22570532915360503,
+ "grad_norm": 0.7558912038803101,
+ "learning_rate": 2.4000000000000003e-06,
+ "loss": 0.6231,
+ "step": 48
+ },
+ {
+ "epoch": 0.2304075235109718,
+ "grad_norm": 0.7822129130363464,
+ "learning_rate": 2.4500000000000003e-06,
+ "loss": 0.6691,
+ "step": 49
+ },
+ {
+ "epoch": 0.23510971786833856,
+ "grad_norm": 0.8646999597549438,
+ "learning_rate": 2.5e-06,
+ "loss": 0.682,
+ "step": 50
+ },
+ {
+ "epoch": 0.23981191222570533,
+ "grad_norm": 0.8824774622917175,
+ "learning_rate": 2.55e-06,
+ "loss": 0.6805,
+ "step": 51
+ },
+ {
+ "epoch": 0.2445141065830721,
+ "grad_norm": 0.7697399258613586,
+ "learning_rate": 2.6e-06,
+ "loss": 0.6368,
+ "step": 52
+ },
+ {
+ "epoch": 0.24921630094043887,
+ "grad_norm": 0.6522512435913086,
+ "learning_rate": 2.6500000000000005e-06,
+ "loss": 0.6367,
+ "step": 53
+ },
+ {
+ "epoch": 0.25391849529780564,
+ "grad_norm": 0.6172305941581726,
+ "learning_rate": 2.7000000000000004e-06,
+ "loss": 0.6291,
+ "step": 54
+ },
+ {
+ "epoch": 0.25862068965517243,
+ "grad_norm": 0.7860460877418518,
+ "learning_rate": 2.7500000000000004e-06,
+ "loss": 0.6736,
+ "step": 55
+ },
+ {
+ "epoch": 0.26332288401253917,
+ "grad_norm": 0.6474862694740295,
+ "learning_rate": 2.8000000000000003e-06,
+ "loss": 0.6365,
+ "step": 56
+ },
+ {
+ "epoch": 0.26802507836990597,
+ "grad_norm": 0.6867114901542664,
+ "learning_rate": 2.85e-06,
+ "loss": 0.6397,
+ "step": 57
+ },
+ {
+ "epoch": 0.2727272727272727,
+ "grad_norm": 0.7056852579116821,
+ "learning_rate": 2.9e-06,
+ "loss": 0.6138,
+ "step": 58
+ },
+ {
+ "epoch": 0.2774294670846395,
+ "grad_norm": 0.6615664958953857,
+ "learning_rate": 2.95e-06,
+ "loss": 0.6482,
+ "step": 59
+ },
+ {
+ "epoch": 0.28213166144200624,
+ "grad_norm": 0.6649022102355957,
+ "learning_rate": 3e-06,
+ "loss": 0.6745,
+ "step": 60
+ },
+ {
+ "epoch": 0.28683385579937304,
+ "grad_norm": 0.850848913192749,
+ "learning_rate": 3.05e-06,
+ "loss": 0.5956,
+ "step": 61
+ },
+ {
+ "epoch": 0.29153605015673983,
+ "grad_norm": 0.5983562469482422,
+ "learning_rate": 3.1000000000000004e-06,
+ "loss": 0.5894,
+ "step": 62
+ },
+ {
+ "epoch": 0.2962382445141066,
+ "grad_norm": 0.6286782622337341,
+ "learning_rate": 3.1500000000000003e-06,
+ "loss": 0.6329,
+ "step": 63
+ },
+ {
+ "epoch": 0.30094043887147337,
+ "grad_norm": 0.5919945240020752,
+ "learning_rate": 3.2000000000000003e-06,
+ "loss": 0.6402,
+ "step": 64
+ },
+ {
+ "epoch": 0.3056426332288401,
+ "grad_norm": 0.5632765889167786,
+ "learning_rate": 3.2500000000000002e-06,
+ "loss": 0.5862,
+ "step": 65
+ },
+ {
+ "epoch": 0.3103448275862069,
+ "grad_norm": 0.7692590951919556,
+ "learning_rate": 3.3000000000000006e-06,
+ "loss": 0.6031,
+ "step": 66
+ },
+ {
+ "epoch": 0.31504702194357365,
+ "grad_norm": 0.7313893437385559,
+ "learning_rate": 3.3500000000000005e-06,
+ "loss": 0.6312,
+ "step": 67
+ },
+ {
+ "epoch": 0.31974921630094044,
+ "grad_norm": 0.6097120642662048,
+ "learning_rate": 3.4000000000000005e-06,
+ "loss": 0.5986,
+ "step": 68
+ },
+ {
+ "epoch": 0.32445141065830724,
+ "grad_norm": 0.5853808522224426,
+ "learning_rate": 3.45e-06,
+ "loss": 0.5847,
+ "step": 69
+ },
+ {
+ "epoch": 0.329153605015674,
+ "grad_norm": 0.6093555092811584,
+ "learning_rate": 3.5e-06,
+ "loss": 0.6552,
+ "step": 70
+ },
+ {
+ "epoch": 0.3338557993730408,
+ "grad_norm": 0.6106334328651428,
+ "learning_rate": 3.5500000000000003e-06,
+ "loss": 0.6196,
+ "step": 71
+ },
+ {
+ "epoch": 0.3385579937304075,
+ "grad_norm": 0.9254828691482544,
+ "learning_rate": 3.6000000000000003e-06,
+ "loss": 0.6005,
+ "step": 72
+ },
+ {
+ "epoch": 0.3432601880877743,
+ "grad_norm": 0.5471694469451904,
+ "learning_rate": 3.65e-06,
+ "loss": 0.5907,
+ "step": 73
+ },
+ {
+ "epoch": 0.34796238244514105,
+ "grad_norm": 0.6204228401184082,
+ "learning_rate": 3.7e-06,
+ "loss": 0.6079,
+ "step": 74
+ },
+ {
+ "epoch": 0.35266457680250785,
+ "grad_norm": 0.52458256483078,
+ "learning_rate": 3.7500000000000005e-06,
+ "loss": 0.6001,
+ "step": 75
+ },
+ {
+ "epoch": 0.3573667711598746,
+ "grad_norm": 0.5356763601303101,
+ "learning_rate": 3.8000000000000005e-06,
+ "loss": 0.5987,
+ "step": 76
+ },
+ {
+ "epoch": 0.3620689655172414,
+ "grad_norm": 0.5408467054367065,
+ "learning_rate": 3.85e-06,
+ "loss": 0.6104,
+ "step": 77
+ },
+ {
+ "epoch": 0.3667711598746082,
+ "grad_norm": 0.5075871348381042,
+ "learning_rate": 3.900000000000001e-06,
+ "loss": 0.5569,
+ "step": 78
+ },
+ {
+ "epoch": 0.3714733542319749,
+ "grad_norm": 0.8474109768867493,
+ "learning_rate": 3.95e-06,
+ "loss": 0.6195,
+ "step": 79
+ },
+ {
+ "epoch": 0.3761755485893417,
+ "grad_norm": 0.4750897288322449,
+ "learning_rate": 4.000000000000001e-06,
+ "loss": 0.5399,
+ "step": 80
+ },
+ {
+ "epoch": 0.38087774294670845,
+ "grad_norm": 0.5082002878189087,
+ "learning_rate": 4.05e-06,
+ "loss": 0.5997,
+ "step": 81
+ },
+ {
+ "epoch": 0.38557993730407525,
+ "grad_norm": 0.5343796014785767,
+ "learning_rate": 4.1e-06,
+ "loss": 0.5704,
+ "step": 82
+ },
+ {
+ "epoch": 0.390282131661442,
+ "grad_norm": 0.520311713218689,
+ "learning_rate": 4.15e-06,
+ "loss": 0.5818,
+ "step": 83
+ },
+ {
+ "epoch": 0.3949843260188088,
+ "grad_norm": 0.5292978286743164,
+ "learning_rate": 4.2000000000000004e-06,
+ "loss": 0.5852,
+ "step": 84
+ },
+ {
+ "epoch": 0.3996865203761755,
+ "grad_norm": 0.539886474609375,
+ "learning_rate": 4.25e-06,
+ "loss": 0.6057,
+ "step": 85
+ },
+ {
+ "epoch": 0.4043887147335423,
+ "grad_norm": 0.6468827128410339,
+ "learning_rate": 4.3e-06,
+ "loss": 0.6122,
+ "step": 86
+ },
+ {
+ "epoch": 0.4090909090909091,
+ "grad_norm": 0.5537365078926086,
+ "learning_rate": 4.350000000000001e-06,
+ "loss": 0.5652,
+ "step": 87
+ },
+ {
+ "epoch": 0.41379310344827586,
+ "grad_norm": 0.6226018667221069,
+ "learning_rate": 4.4e-06,
+ "loss": 0.5884,
+ "step": 88
+ },
+ {
+ "epoch": 0.41849529780564265,
+ "grad_norm": 0.5016945004463196,
+ "learning_rate": 4.450000000000001e-06,
+ "loss": 0.5877,
+ "step": 89
+ },
+ {
+ "epoch": 0.4231974921630094,
+ "grad_norm": 0.5059167146682739,
+ "learning_rate": 4.5e-06,
+ "loss": 0.5676,
+ "step": 90
+ },
+ {
+ "epoch": 0.4278996865203762,
+ "grad_norm": 0.47521743178367615,
+ "learning_rate": 4.5500000000000005e-06,
+ "loss": 0.5929,
+ "step": 91
+ },
+ {
+ "epoch": 0.43260188087774293,
+ "grad_norm": 0.531306266784668,
+ "learning_rate": 4.600000000000001e-06,
+ "loss": 0.5983,
+ "step": 92
+ },
+ {
+ "epoch": 0.4373040752351097,
+ "grad_norm": 0.4965567886829376,
+ "learning_rate": 4.65e-06,
+ "loss": 0.5279,
+ "step": 93
+ },
+ {
+ "epoch": 0.44200626959247646,
+ "grad_norm": 0.5125988125801086,
+ "learning_rate": 4.7e-06,
+ "loss": 0.5436,
+ "step": 94
+ },
+ {
+ "epoch": 0.44670846394984326,
+ "grad_norm": 0.557763934135437,
+ "learning_rate": 4.75e-06,
+ "loss": 0.5496,
+ "step": 95
+ },
+ {
+ "epoch": 0.45141065830721006,
+ "grad_norm": 0.6993274092674255,
+ "learning_rate": 4.800000000000001e-06,
+ "loss": 0.5498,
+ "step": 96
+ },
+ {
+ "epoch": 0.4561128526645768,
+ "grad_norm": 0.5485453009605408,
+ "learning_rate": 4.85e-06,
+ "loss": 0.5552,
+ "step": 97
+ },
+ {
+ "epoch": 0.4608150470219436,
+ "grad_norm": 1.9821522235870361,
+ "learning_rate": 4.9000000000000005e-06,
+ "loss": 0.569,
+ "step": 98
+ },
+ {
+ "epoch": 0.46551724137931033,
+ "grad_norm": 0.6074144840240479,
+ "learning_rate": 4.95e-06,
+ "loss": 0.5546,
+ "step": 99
+ },
+ {
+ "epoch": 0.4702194357366771,
+ "grad_norm": 0.5404040813446045,
+ "learning_rate": 5e-06,
+ "loss": 0.5775,
+ "step": 100
+ },
+ {
+ "epoch": 0.47492163009404387,
+ "grad_norm": 0.500438928604126,
+ "learning_rate": 4.9999910183883085e-06,
+ "loss": 0.5569,
+ "step": 101
+ },
+ {
+ "epoch": 0.47962382445141066,
+ "grad_norm": 0.5036981701850891,
+ "learning_rate": 4.999964073617768e-06,
+ "loss": 0.5663,
+ "step": 102
+ },
+ {
+ "epoch": 0.4843260188087774,
+ "grad_norm": 0.4537642300128937,
+ "learning_rate": 4.999919165881985e-06,
+ "loss": 0.5527,
+ "step": 103
+ },
+ {
+ "epoch": 0.4890282131661442,
+ "grad_norm": 0.49653521180152893,
+ "learning_rate": 4.999856295503635e-06,
+ "loss": 0.563,
+ "step": 104
+ },
+ {
+ "epoch": 0.493730407523511,
+ "grad_norm": 0.46847566962242126,
+ "learning_rate": 4.9997754629344596e-06,
+ "loss": 0.5425,
+ "step": 105
+ },
+ {
+ "epoch": 0.49843260188087773,
+ "grad_norm": 0.5192411541938782,
+ "learning_rate": 4.999676668755263e-06,
+ "loss": 0.5315,
+ "step": 106
+ },
+ {
+ "epoch": 0.5031347962382445,
+ "grad_norm": 0.5170287489891052,
+ "learning_rate": 4.999559913675912e-06,
+ "loss": 0.5627,
+ "step": 107
+ },
+ {
+ "epoch": 0.5078369905956113,
+ "grad_norm": 0.47297438979148865,
+ "learning_rate": 4.999425198535325e-06,
+ "loss": 0.5432,
+ "step": 108
+ },
+ {
+ "epoch": 0.512539184952978,
+ "grad_norm": 0.4873776137828827,
+ "learning_rate": 4.999272524301469e-06,
+ "loss": 0.5473,
+ "step": 109
+ },
+ {
+ "epoch": 0.5172413793103449,
+ "grad_norm": 0.5432935357093811,
+ "learning_rate": 4.9991018920713505e-06,
+ "loss": 0.5642,
+ "step": 110
+ },
+ {
+ "epoch": 0.5219435736677116,
+ "grad_norm": 0.4850105345249176,
+ "learning_rate": 4.9989133030710154e-06,
+ "loss": 0.548,
+ "step": 111
+ },
+ {
+ "epoch": 0.5266457680250783,
+ "grad_norm": 0.9399585723876953,
+ "learning_rate": 4.9987067586555275e-06,
+ "loss": 0.5471,
+ "step": 112
+ },
+ {
+ "epoch": 0.5313479623824452,
+ "grad_norm": 0.5167811512947083,
+ "learning_rate": 4.998482260308969e-06,
+ "loss": 0.5648,
+ "step": 113
+ },
+ {
+ "epoch": 0.5360501567398119,
+ "grad_norm": 0.5069029927253723,
+ "learning_rate": 4.998239809644427e-06,
+ "loss": 0.5568,
+ "step": 114
+ },
+ {
+ "epoch": 0.5407523510971787,
+ "grad_norm": 0.8738563656806946,
+ "learning_rate": 4.9979794084039755e-06,
+ "loss": 0.5719,
+ "step": 115
+ },
+ {
+ "epoch": 0.5454545454545454,
+ "grad_norm": 0.5216553807258606,
+ "learning_rate": 4.997701058458677e-06,
+ "loss": 0.5309,
+ "step": 116
+ },
+ {
+ "epoch": 0.5501567398119123,
+ "grad_norm": 0.9678344130516052,
+ "learning_rate": 4.997404761808554e-06,
+ "loss": 0.5645,
+ "step": 117
+ },
+ {
+ "epoch": 0.554858934169279,
+ "grad_norm": 0.496598482131958,
+ "learning_rate": 4.9970905205825845e-06,
+ "loss": 0.5711,
+ "step": 118
+ },
+ {
+ "epoch": 0.5595611285266457,
+ "grad_norm": 0.4745199680328369,
+ "learning_rate": 4.996758337038683e-06,
+ "loss": 0.5613,
+ "step": 119
+ },
+ {
+ "epoch": 0.5642633228840125,
+ "grad_norm": 0.5595977902412415,
+ "learning_rate": 4.996408213563684e-06,
+ "loss": 0.5559,
+ "step": 120
+ },
+ {
+ "epoch": 0.5689655172413793,
+ "grad_norm": 0.4743712544441223,
+ "learning_rate": 4.996040152673326e-06,
+ "loss": 0.5228,
+ "step": 121
+ },
+ {
+ "epoch": 0.5736677115987461,
+ "grad_norm": 0.5418100953102112,
+ "learning_rate": 4.995654157012233e-06,
+ "loss": 0.536,
+ "step": 122
+ },
+ {
+ "epoch": 0.5783699059561128,
+ "grad_norm": 0.521977424621582,
+ "learning_rate": 4.995250229353895e-06,
+ "loss": 0.5305,
+ "step": 123
+ },
+ {
+ "epoch": 0.5830721003134797,
+ "grad_norm": 0.5062761902809143,
+ "learning_rate": 4.99482837260065e-06,
+ "loss": 0.5417,
+ "step": 124
+ },
+ {
+ "epoch": 0.5877742946708464,
+ "grad_norm": 0.5895913243293762,
+ "learning_rate": 4.99438858978366e-06,
+ "loss": 0.573,
+ "step": 125
+ },
+ {
+ "epoch": 0.5924764890282131,
+ "grad_norm": 0.5442466139793396,
+ "learning_rate": 4.993930884062892e-06,
+ "loss": 0.5563,
+ "step": 126
+ },
+ {
+ "epoch": 0.5971786833855799,
+ "grad_norm": 0.5130571722984314,
+ "learning_rate": 4.993455258727094e-06,
+ "loss": 0.5549,
+ "step": 127
+ },
+ {
+ "epoch": 0.6018808777429467,
+ "grad_norm": 0.5579081773757935,
+ "learning_rate": 4.992961717193773e-06,
+ "loss": 0.5554,
+ "step": 128
+ },
+ {
+ "epoch": 0.6065830721003135,
+ "grad_norm": 0.6375890374183655,
+ "learning_rate": 4.9924502630091655e-06,
+ "loss": 0.5626,
+ "step": 129
+ },
+ {
+ "epoch": 0.6112852664576802,
+ "grad_norm": 0.5129190683364868,
+ "learning_rate": 4.99192089984822e-06,
+ "loss": 0.5493,
+ "step": 130
+ },
+ {
+ "epoch": 0.6159874608150471,
+ "grad_norm": 0.5293419361114502,
+ "learning_rate": 4.9913736315145614e-06,
+ "loss": 0.5565,
+ "step": 131
+ },
+ {
+ "epoch": 0.6206896551724138,
+ "grad_norm": 0.6502572298049927,
+ "learning_rate": 4.990808461940474e-06,
+ "loss": 0.5358,
+ "step": 132
+ },
+ {
+ "epoch": 0.6253918495297806,
+ "grad_norm": 0.5450296998023987,
+ "learning_rate": 4.990225395186862e-06,
+ "loss": 0.5421,
+ "step": 133
+ },
+ {
+ "epoch": 0.6300940438871473,
+ "grad_norm": 0.45506399869918823,
+ "learning_rate": 4.9896244354432314e-06,
+ "loss": 0.5396,
+ "step": 134
+ },
+ {
+ "epoch": 0.6347962382445141,
+ "grad_norm": 0.5095545649528503,
+ "learning_rate": 4.98900558702765e-06,
+ "loss": 0.5486,
+ "step": 135
+ },
+ {
+ "epoch": 0.6394984326018809,
+ "grad_norm": 0.4836446940898895,
+ "learning_rate": 4.9883688543867225e-06,
+ "loss": 0.5596,
+ "step": 136
+ },
+ {
+ "epoch": 0.6442006269592476,
+ "grad_norm": 0.5253512859344482,
+ "learning_rate": 4.987714242095558e-06,
+ "loss": 0.5308,
+ "step": 137
+ },
+ {
+ "epoch": 0.6489028213166145,
+ "grad_norm": 0.8280164003372192,
+ "learning_rate": 4.9870417548577355e-06,
+ "loss": 0.5349,
+ "step": 138
+ },
+ {
+ "epoch": 0.6536050156739812,
+ "grad_norm": 0.4729730188846588,
+ "learning_rate": 4.9863513975052696e-06,
+ "loss": 0.5416,
+ "step": 139
+ },
+ {
+ "epoch": 0.658307210031348,
+ "grad_norm": 0.5932718515396118,
+ "learning_rate": 4.985643174998578e-06,
+ "loss": 0.5638,
+ "step": 140
+ },
+ {
+ "epoch": 0.6630094043887147,
+ "grad_norm": 0.5187026262283325,
+ "learning_rate": 4.984917092426445e-06,
+ "loss": 0.5507,
+ "step": 141
+ },
+ {
+ "epoch": 0.6677115987460815,
+ "grad_norm": 0.5024245977401733,
+ "learning_rate": 4.984173155005982e-06,
+ "loss": 0.5406,
+ "step": 142
+ },
+ {
+ "epoch": 0.6724137931034483,
+ "grad_norm": 0.4735509157180786,
+ "learning_rate": 4.983411368082597e-06,
+ "loss": 0.5431,
+ "step": 143
+ },
+ {
+ "epoch": 0.677115987460815,
+ "grad_norm": 0.5040024518966675,
+ "learning_rate": 4.982631737129948e-06,
+ "loss": 0.5291,
+ "step": 144
+ },
+ {
+ "epoch": 0.6818181818181818,
+ "grad_norm": 0.47764894366264343,
+ "learning_rate": 4.98183426774991e-06,
+ "loss": 0.5677,
+ "step": 145
+ },
+ {
+ "epoch": 0.6865203761755486,
+ "grad_norm": 0.5211489796638489,
+ "learning_rate": 4.981018965672529e-06,
+ "loss": 0.566,
+ "step": 146
+ },
+ {
+ "epoch": 0.6912225705329154,
+ "grad_norm": 1.022007942199707,
+ "learning_rate": 4.98018583675599e-06,
+ "loss": 0.5476,
+ "step": 147
+ },
+ {
+ "epoch": 0.6959247648902821,
+ "grad_norm": 0.5263912677764893,
+ "learning_rate": 4.979334886986562e-06,
+ "loss": 0.5473,
+ "step": 148
+ },
+ {
+ "epoch": 0.700626959247649,
+ "grad_norm": 0.5014091730117798,
+ "learning_rate": 4.978466122478567e-06,
+ "loss": 0.5642,
+ "step": 149
+ },
+ {
+ "epoch": 0.7053291536050157,
+ "grad_norm": 0.5003350973129272,
+ "learning_rate": 4.97757954947433e-06,
+ "loss": 0.5311,
+ "step": 150
+ },
+ {
+ "epoch": 0.7100313479623824,
+ "grad_norm": 0.5010690093040466,
+ "learning_rate": 4.976675174344132e-06,
+ "loss": 0.5469,
+ "step": 151
+ },
+ {
+ "epoch": 0.7147335423197492,
+ "grad_norm": 0.45779237151145935,
+ "learning_rate": 4.975753003586172e-06,
+ "loss": 0.5273,
+ "step": 152
+ },
+ {
+ "epoch": 0.719435736677116,
+ "grad_norm": 0.6231539845466614,
+ "learning_rate": 4.974813043826513e-06,
+ "loss": 0.5182,
+ "step": 153
+ },
+ {
+ "epoch": 0.7241379310344828,
+ "grad_norm": 0.5361394286155701,
+ "learning_rate": 4.973855301819039e-06,
+ "loss": 0.5372,
+ "step": 154
+ },
+ {
+ "epoch": 0.7288401253918495,
+ "grad_norm": 0.5193538665771484,
+ "learning_rate": 4.972879784445402e-06,
+ "loss": 0.5201,
+ "step": 155
+ },
+ {
+ "epoch": 0.7335423197492164,
+ "grad_norm": 0.47956809401512146,
+ "learning_rate": 4.971886498714978e-06,
+ "loss": 0.5402,
+ "step": 156
+ },
+ {
+ "epoch": 0.7382445141065831,
+ "grad_norm": 0.5303016901016235,
+ "learning_rate": 4.97087545176481e-06,
+ "loss": 0.5174,
+ "step": 157
+ },
+ {
+ "epoch": 0.7429467084639498,
+ "grad_norm": 0.5002286434173584,
+ "learning_rate": 4.9698466508595655e-06,
+ "loss": 0.5453,
+ "step": 158
+ },
+ {
+ "epoch": 0.7476489028213166,
+ "grad_norm": 0.6070297360420227,
+ "learning_rate": 4.9688001033914756e-06,
+ "loss": 0.5327,
+ "step": 159
+ },
+ {
+ "epoch": 0.7523510971786834,
+ "grad_norm": 0.5436793565750122,
+ "learning_rate": 4.967735816880286e-06,
+ "loss": 0.544,
+ "step": 160
+ },
+ {
+ "epoch": 0.7570532915360502,
+ "grad_norm": 0.538012683391571,
+ "learning_rate": 4.966653798973205e-06,
+ "loss": 0.5233,
+ "step": 161
+ },
+ {
+ "epoch": 0.7617554858934169,
+ "grad_norm": 0.4916169345378876,
+ "learning_rate": 4.965554057444842e-06,
+ "loss": 0.5168,
+ "step": 162
+ },
+ {
+ "epoch": 0.7664576802507836,
+ "grad_norm": 0.48281437158584595,
+ "learning_rate": 4.964436600197161e-06,
+ "loss": 0.5393,
+ "step": 163
+ },
+ {
+ "epoch": 0.7711598746081505,
+ "grad_norm": 0.5184990167617798,
+ "learning_rate": 4.963301435259413e-06,
+ "loss": 0.5085,
+ "step": 164
+ },
+ {
+ "epoch": 0.7758620689655172,
+ "grad_norm": 0.4706438183784485,
+ "learning_rate": 4.962148570788088e-06,
+ "loss": 0.5299,
+ "step": 165
+ },
+ {
+ "epoch": 0.780564263322884,
+ "grad_norm": 0.6550764441490173,
+ "learning_rate": 4.96097801506685e-06,
+ "loss": 0.5192,
+ "step": 166
+ },
+ {
+ "epoch": 0.7852664576802508,
+ "grad_norm": 0.5386581420898438,
+ "learning_rate": 4.959789776506482e-06,
+ "loss": 0.5258,
+ "step": 167
+ },
+ {
+ "epoch": 0.7899686520376176,
+ "grad_norm": 0.5060779452323914,
+ "learning_rate": 4.958583863644821e-06,
+ "loss": 0.5512,
+ "step": 168
+ },
+ {
+ "epoch": 0.7946708463949843,
+ "grad_norm": 0.47050032019615173,
+ "learning_rate": 4.9573602851466985e-06,
+ "loss": 0.5176,
+ "step": 169
+ },
+ {
+ "epoch": 0.799373040752351,
+ "grad_norm": 7.3139567375183105,
+ "learning_rate": 4.9561190498038815e-06,
+ "loss": 0.5381,
+ "step": 170
+ },
+ {
+ "epoch": 0.8040752351097179,
+ "grad_norm": 0.620528519153595,
+ "learning_rate": 4.954860166535005e-06,
+ "loss": 0.5299,
+ "step": 171
+ },
+ {
+ "epoch": 0.8087774294670846,
+ "grad_norm": 0.45067766308784485,
+ "learning_rate": 4.95358364438551e-06,
+ "loss": 0.5328,
+ "step": 172
+ },
+ {
+ "epoch": 0.8134796238244514,
+ "grad_norm": 0.6771508455276489,
+ "learning_rate": 4.952289492527576e-06,
+ "loss": 0.5601,
+ "step": 173
+ },
+ {
+ "epoch": 0.8181818181818182,
+ "grad_norm": 0.518925130367279,
+ "learning_rate": 4.9509777202600605e-06,
+ "loss": 0.494,
+ "step": 174
+ },
+ {
+ "epoch": 0.822884012539185,
+ "grad_norm": 0.5191988945007324,
+ "learning_rate": 4.949648337008425e-06,
+ "loss": 0.5425,
+ "step": 175
+ },
+ {
+ "epoch": 0.8275862068965517,
+ "grad_norm": 0.8600963354110718,
+ "learning_rate": 4.948301352324674e-06,
+ "loss": 0.5332,
+ "step": 176
+ },
+ {
+ "epoch": 0.8322884012539185,
+ "grad_norm": 0.5405915379524231,
+ "learning_rate": 4.946936775887281e-06,
+ "loss": 0.5276,
+ "step": 177
+ },
+ {
+ "epoch": 0.8369905956112853,
+ "grad_norm": 0.48730772733688354,
+ "learning_rate": 4.945554617501124e-06,
+ "loss": 0.5217,
+ "step": 178
+ },
+ {
+ "epoch": 0.841692789968652,
+ "grad_norm": 0.5092865824699402,
+ "learning_rate": 4.944154887097411e-06,
+ "loss": 0.5534,
+ "step": 179
+ },
+ {
+ "epoch": 0.8463949843260188,
+ "grad_norm": 0.4994933605194092,
+ "learning_rate": 4.942737594733608e-06,
+ "loss": 0.5242,
+ "step": 180
+ },
+ {
+ "epoch": 0.8510971786833855,
+ "grad_norm": 0.4554043412208557,
+ "learning_rate": 4.941302750593373e-06,
+ "loss": 0.5424,
+ "step": 181
+ },
+ {
+ "epoch": 0.8557993730407524,
+ "grad_norm": 0.4865265488624573,
+ "learning_rate": 4.939850364986475e-06,
+ "loss": 0.482,
+ "step": 182
+ },
+ {
+ "epoch": 0.8605015673981191,
+ "grad_norm": 0.5013875365257263,
+ "learning_rate": 4.938380448348725e-06,
+ "loss": 0.4908,
+ "step": 183
+ },
+ {
+ "epoch": 0.8652037617554859,
+ "grad_norm": 0.4997917115688324,
+ "learning_rate": 4.9368930112419e-06,
+ "loss": 0.5336,
+ "step": 184
+ },
+ {
+ "epoch": 0.8699059561128527,
+ "grad_norm": 0.4783482551574707,
+ "learning_rate": 4.935388064353665e-06,
+ "loss": 0.5338,
+ "step": 185
+ },
+ {
+ "epoch": 0.8746081504702194,
+ "grad_norm": 0.7221089005470276,
+ "learning_rate": 4.9338656184975e-06,
+ "loss": 0.5327,
+ "step": 186
+ },
+ {
+ "epoch": 0.8793103448275862,
+ "grad_norm": 0.48115843534469604,
+ "learning_rate": 4.932325684612618e-06,
+ "loss": 0.5408,
+ "step": 187
+ },
+ {
+ "epoch": 0.8840125391849529,
+ "grad_norm": 0.4940219223499298,
+ "learning_rate": 4.93076827376389e-06,
+ "loss": 0.5455,
+ "step": 188
+ },
+ {
+ "epoch": 0.8887147335423198,
+ "grad_norm": 0.4754747450351715,
+ "learning_rate": 4.9291933971417635e-06,
+ "loss": 0.542,
+ "step": 189
+ },
+ {
+ "epoch": 0.8934169278996865,
+ "grad_norm": 0.548713207244873,
+ "learning_rate": 4.9276010660621835e-06,
+ "loss": 0.5292,
+ "step": 190
+ },
+ {
+ "epoch": 0.8981191222570533,
+ "grad_norm": 0.7292612195014954,
+ "learning_rate": 4.925991291966508e-06,
+ "loss": 0.5073,
+ "step": 191
+ },
+ {
+ "epoch": 0.9028213166144201,
+ "grad_norm": 0.5254770517349243,
+ "learning_rate": 4.92436408642143e-06,
+ "loss": 0.5451,
+ "step": 192
+ },
+ {
+ "epoch": 0.9075235109717869,
+ "grad_norm": 0.47938767075538635,
+ "learning_rate": 4.9227194611188934e-06,
+ "loss": 0.5204,
+ "step": 193
+ },
+ {
+ "epoch": 0.9122257053291536,
+ "grad_norm": 0.6740232706069946,
+ "learning_rate": 4.921057427876007e-06,
+ "loss": 0.4928,
+ "step": 194
+ },
+ {
+ "epoch": 0.9169278996865203,
+ "grad_norm": 0.5455343723297119,
+ "learning_rate": 4.919377998634959e-06,
+ "loss": 0.5468,
+ "step": 195
+ },
+ {
+ "epoch": 0.9216300940438872,
+ "grad_norm": 0.5001958012580872,
+ "learning_rate": 4.917681185462934e-06,
+ "loss": 0.5339,
+ "step": 196
+ },
+ {
+ "epoch": 0.9263322884012539,
+ "grad_norm": 0.5084257125854492,
+ "learning_rate": 4.915967000552028e-06,
+ "loss": 0.5259,
+ "step": 197
+ },
+ {
+ "epoch": 0.9310344827586207,
+ "grad_norm": 0.4807164967060089,
+ "learning_rate": 4.914235456219154e-06,
+ "loss": 0.5204,
+ "step": 198
+ },
+ {
+ "epoch": 0.9357366771159875,
+ "grad_norm": 0.6099370718002319,
+ "learning_rate": 4.912486564905959e-06,
+ "loss": 0.544,
+ "step": 199
+ },
+ {
+ "epoch": 0.9404388714733543,
+ "grad_norm": 0.47461947798728943,
+ "learning_rate": 4.910720339178735e-06,
+ "loss": 0.5295,
+ "step": 200
+ },
+ {
+ "epoch": 0.945141065830721,
+ "grad_norm": 0.500136137008667,
+ "learning_rate": 4.908936791728323e-06,
+ "loss": 0.5321,
+ "step": 201
+ },
+ {
+ "epoch": 0.9498432601880877,
+ "grad_norm": 0.5235631465911865,
+ "learning_rate": 4.907135935370027e-06,
+ "loss": 0.5338,
+ "step": 202
+ },
+ {
+ "epoch": 0.9545454545454546,
+ "grad_norm": 0.9285804629325867,
+ "learning_rate": 4.905317783043523e-06,
+ "loss": 0.5393,
+ "step": 203
+ },
+ {
+ "epoch": 0.9592476489028213,
+ "grad_norm": 0.4834178388118744,
+ "learning_rate": 4.9034823478127605e-06,
+ "loss": 0.5211,
+ "step": 204
+ },
+ {
+ "epoch": 0.9639498432601881,
+ "grad_norm": 0.4830580949783325,
+ "learning_rate": 4.901629642865872e-06,
+ "loss": 0.4986,
+ "step": 205
+ },
+ {
+ "epoch": 0.9686520376175548,
+ "grad_norm": 0.49718615412712097,
+ "learning_rate": 4.89975968151508e-06,
+ "loss": 0.5204,
+ "step": 206
+ },
+ {
+ "epoch": 0.9733542319749217,
+ "grad_norm": 0.5056726336479187,
+ "learning_rate": 4.8978724771965965e-06,
+ "loss": 0.5133,
+ "step": 207
+ },
+ {
+ "epoch": 0.9780564263322884,
+ "grad_norm": 0.7357563376426697,
+ "learning_rate": 4.895968043470532e-06,
+ "loss": 0.5307,
+ "step": 208
+ },
+ {
+ "epoch": 0.9827586206896551,
+ "grad_norm": 0.515610933303833,
+ "learning_rate": 4.894046394020794e-06,
+ "loss": 0.4955,
+ "step": 209
+ },
+ {
+ "epoch": 0.987460815047022,
+ "grad_norm": 0.5124618411064148,
+ "learning_rate": 4.892107542654988e-06,
+ "loss": 0.526,
+ "step": 210
+ },
+ {
+ "epoch": 0.9921630094043887,
+ "grad_norm": 0.5059565901756287,
+ "learning_rate": 4.890151503304325e-06,
+ "loss": 0.5473,
+ "step": 211
+ },
+ {
+ "epoch": 0.9968652037617555,
+ "grad_norm": 0.4806717336177826,
+ "learning_rate": 4.88817829002351e-06,
+ "loss": 0.5212,
+ "step": 212
+ },
+ {
+ "epoch": 1.0047021943573669,
+ "grad_norm": 0.9454345703125,
+ "learning_rate": 4.886187916990653e-06,
+ "loss": 1.0566,
+ "step": 213
+ },
+ {
+ "epoch": 1.0094043887147335,
+ "grad_norm": 0.4871070086956024,
+ "learning_rate": 4.884180398507163e-06,
+ "loss": 0.503,
+ "step": 214
+ },
+ {
+ "epoch": 1.0141065830721003,
+ "grad_norm": 0.45102012157440186,
+ "learning_rate": 4.882155748997636e-06,
+ "loss": 0.4954,
+ "step": 215
+ },
+ {
+ "epoch": 1.0188087774294672,
+ "grad_norm": 0.49910685420036316,
+ "learning_rate": 4.8801139830097685e-06,
+ "loss": 0.5019,
+ "step": 216
+ },
+ {
+ "epoch": 1.0235109717868338,
+ "grad_norm": 0.5155763030052185,
+ "learning_rate": 4.878055115214238e-06,
+ "loss": 0.5102,
+ "step": 217
+ },
+ {
+ "epoch": 1.0282131661442007,
+ "grad_norm": 0.4567059874534607,
+ "learning_rate": 4.875979160404607e-06,
+ "loss": 0.5069,
+ "step": 218
+ },
+ {
+ "epoch": 1.0329153605015673,
+ "grad_norm": 0.4782896935939789,
+ "learning_rate": 4.873886133497209e-06,
+ "loss": 0.5182,
+ "step": 219
+ },
+ {
+ "epoch": 1.0376175548589341,
+ "grad_norm": 0.44995731115341187,
+ "learning_rate": 4.87177604953105e-06,
+ "loss": 0.513,
+ "step": 220
+ },
+ {
+ "epoch": 1.042319749216301,
+ "grad_norm": 0.470059871673584,
+ "learning_rate": 4.869648923667694e-06,
+ "loss": 0.468,
+ "step": 221
+ },
+ {
+ "epoch": 1.0470219435736676,
+ "grad_norm": 0.5356128215789795,
+ "learning_rate": 4.867504771191154e-06,
+ "loss": 0.4942,
+ "step": 222
+ },
+ {
+ "epoch": 1.0517241379310345,
+ "grad_norm": 0.5137870907783508,
+ "learning_rate": 4.865343607507788e-06,
+ "loss": 0.5022,
+ "step": 223
+ },
+ {
+ "epoch": 1.0564263322884013,
+ "grad_norm": 0.47419992089271545,
+ "learning_rate": 4.86316544814618e-06,
+ "loss": 0.5158,
+ "step": 224
+ },
+ {
+ "epoch": 1.061128526645768,
+ "grad_norm": 0.49087393283843994,
+ "learning_rate": 4.860970308757038e-06,
+ "loss": 0.4605,
+ "step": 225
+ },
+ {
+ "epoch": 1.0658307210031348,
+ "grad_norm": 0.4988348186016083,
+ "learning_rate": 4.858758205113072e-06,
+ "loss": 0.4912,
+ "step": 226
+ },
+ {
+ "epoch": 1.0705329153605017,
+ "grad_norm": 0.44543248414993286,
+ "learning_rate": 4.856529153108888e-06,
+ "loss": 0.524,
+ "step": 227
+ },
+ {
+ "epoch": 1.0752351097178683,
+ "grad_norm": 0.5953351259231567,
+ "learning_rate": 4.854283168760868e-06,
+ "loss": 0.5001,
+ "step": 228
+ },
+ {
+ "epoch": 1.0799373040752351,
+ "grad_norm": 0.5012004375457764,
+ "learning_rate": 4.85202026820706e-06,
+ "loss": 0.4968,
+ "step": 229
+ },
+ {
+ "epoch": 1.084639498432602,
+ "grad_norm": 0.5023937821388245,
+ "learning_rate": 4.84974046770706e-06,
+ "loss": 0.5345,
+ "step": 230
+ },
+ {
+ "epoch": 1.0893416927899686,
+ "grad_norm": 0.4705684185028076,
+ "learning_rate": 4.847443783641893e-06,
+ "loss": 0.4459,
+ "step": 231
+ },
+ {
+ "epoch": 1.0940438871473355,
+ "grad_norm": 0.5082476735115051,
+ "learning_rate": 4.845130232513901e-06,
+ "loss": 0.4905,
+ "step": 232
+ },
+ {
+ "epoch": 1.098746081504702,
+ "grad_norm": 0.5283995866775513,
+ "learning_rate": 4.842799830946615e-06,
+ "loss": 0.4878,
+ "step": 233
+ },
+ {
+ "epoch": 1.103448275862069,
+ "grad_norm": 0.6373623013496399,
+ "learning_rate": 4.840452595684646e-06,
+ "loss": 0.4867,
+ "step": 234
+ },
+ {
+ "epoch": 1.1081504702194358,
+ "grad_norm": 0.4624481201171875,
+ "learning_rate": 4.83808854359356e-06,
+ "loss": 0.4793,
+ "step": 235
+ },
+ {
+ "epoch": 1.1128526645768024,
+ "grad_norm": 0.4659098982810974,
+ "learning_rate": 4.835707691659753e-06,
+ "loss": 0.4827,
+ "step": 236
+ },
+ {
+ "epoch": 1.1175548589341693,
+ "grad_norm": 0.4920850396156311,
+ "learning_rate": 4.8333100569903365e-06,
+ "loss": 0.4932,
+ "step": 237
+ },
+ {
+ "epoch": 1.1222570532915361,
+ "grad_norm": 0.492286741733551,
+ "learning_rate": 4.8308956568130094e-06,
+ "loss": 0.5144,
+ "step": 238
+ },
+ {
+ "epoch": 1.1269592476489028,
+ "grad_norm": 0.5429807901382446,
+ "learning_rate": 4.828464508475934e-06,
+ "loss": 0.5054,
+ "step": 239
+ },
+ {
+ "epoch": 1.1316614420062696,
+ "grad_norm": 2.4671998023986816,
+ "learning_rate": 4.826016629447616e-06,
+ "loss": 0.5073,
+ "step": 240
+ },
+ {
+ "epoch": 1.1363636363636362,
+ "grad_norm": 0.4593118131160736,
+ "learning_rate": 4.823552037316775e-06,
+ "loss": 0.4856,
+ "step": 241
+ },
+ {
+ "epoch": 1.141065830721003,
+ "grad_norm": 0.6855646371841431,
+ "learning_rate": 4.821070749792218e-06,
+ "loss": 0.5388,
+ "step": 242
+ },
+ {
+ "epoch": 1.14576802507837,
+ "grad_norm": 0.5722374320030212,
+ "learning_rate": 4.818572784702713e-06,
+ "loss": 0.51,
+ "step": 243
+ },
+ {
+ "epoch": 1.1504702194357366,
+ "grad_norm": 0.4901357591152191,
+ "learning_rate": 4.816058159996863e-06,
+ "loss": 0.5201,
+ "step": 244
+ },
+ {
+ "epoch": 1.1551724137931034,
+ "grad_norm": 0.4655209481716156,
+ "learning_rate": 4.813526893742972e-06,
+ "loss": 0.501,
+ "step": 245
+ },
+ {
+ "epoch": 1.1598746081504703,
+ "grad_norm": 0.7608394622802734,
+ "learning_rate": 4.810979004128924e-06,
+ "loss": 0.4961,
+ "step": 246
+ },
+ {
+ "epoch": 1.164576802507837,
+ "grad_norm": 0.4857081472873688,
+ "learning_rate": 4.808414509462042e-06,
+ "loss": 0.5174,
+ "step": 247
+ },
+ {
+ "epoch": 1.1692789968652038,
+ "grad_norm": 0.46672946214675903,
+ "learning_rate": 4.80583342816896e-06,
+ "loss": 0.484,
+ "step": 248
+ },
+ {
+ "epoch": 1.1739811912225706,
+ "grad_norm": 0.46982088685035706,
+ "learning_rate": 4.803235778795496e-06,
+ "loss": 0.5236,
+ "step": 249
+ },
+ {
+ "epoch": 1.1786833855799372,
+ "grad_norm": 0.5086098909378052,
+ "learning_rate": 4.800621580006511e-06,
+ "loss": 0.4673,
+ "step": 250
+ },
+ {
+ "epoch": 1.183385579937304,
+ "grad_norm": 0.45968860387802124,
+ "learning_rate": 4.797990850585782e-06,
+ "loss": 0.5151,
+ "step": 251
+ },
+ {
+ "epoch": 1.188087774294671,
+ "grad_norm": 0.49544984102249146,
+ "learning_rate": 4.79534360943586e-06,
+ "loss": 0.494,
+ "step": 252
+ },
+ {
+ "epoch": 1.1927899686520376,
+ "grad_norm": 0.531892716884613,
+ "learning_rate": 4.792679875577937e-06,
+ "loss": 0.4778,
+ "step": 253
+ },
+ {
+ "epoch": 1.1974921630094044,
+ "grad_norm": 0.5013542175292969,
+ "learning_rate": 4.789999668151714e-06,
+ "loss": 0.5132,
+ "step": 254
+ },
+ {
+ "epoch": 1.2021943573667713,
+ "grad_norm": 0.46963250637054443,
+ "learning_rate": 4.7873030064152545e-06,
+ "loss": 0.4938,
+ "step": 255
+ },
+ {
+ "epoch": 1.206896551724138,
+ "grad_norm": 0.465285986661911,
+ "learning_rate": 4.784589909744856e-06,
+ "loss": 0.4898,
+ "step": 256
+ },
+ {
+ "epoch": 1.2115987460815048,
+ "grad_norm": 0.5183936357498169,
+ "learning_rate": 4.7818603976349005e-06,
+ "loss": 0.5004,
+ "step": 257
+ },
+ {
+ "epoch": 1.2163009404388714,
+ "grad_norm": 0.47324836254119873,
+ "learning_rate": 4.779114489697724e-06,
+ "loss": 0.4972,
+ "step": 258
+ },
+ {
+ "epoch": 1.2210031347962382,
+ "grad_norm": 0.5208264589309692,
+ "learning_rate": 4.776352205663469e-06,
+ "loss": 0.5023,
+ "step": 259
+ },
+ {
+ "epoch": 1.225705329153605,
+ "grad_norm": 0.5583804845809937,
+ "learning_rate": 4.773573565379947e-06,
+ "loss": 0.5099,
+ "step": 260
+ },
+ {
+ "epoch": 1.2304075235109717,
+ "grad_norm": 0.5016160011291504,
+ "learning_rate": 4.770778588812489e-06,
+ "loss": 0.4765,
+ "step": 261
+ },
+ {
+ "epoch": 1.2351097178683386,
+ "grad_norm": 0.50210040807724,
+ "learning_rate": 4.7679672960438135e-06,
+ "loss": 0.5029,
+ "step": 262
+ },
+ {
+ "epoch": 1.2398119122257054,
+ "grad_norm": 0.6636150479316711,
+ "learning_rate": 4.765139707273872e-06,
+ "loss": 0.4909,
+ "step": 263
+ },
+ {
+ "epoch": 1.244514106583072,
+ "grad_norm": 0.4798625111579895,
+ "learning_rate": 4.762295842819707e-06,
+ "loss": 0.5012,
+ "step": 264
+ },
+ {
+ "epoch": 1.249216300940439,
+ "grad_norm": 0.5282374024391174,
+ "learning_rate": 4.759435723115308e-06,
+ "loss": 0.4681,
+ "step": 265
+ },
+ {
+ "epoch": 1.2539184952978055,
+ "grad_norm": 0.5356930494308472,
+ "learning_rate": 4.756559368711463e-06,
+ "loss": 0.506,
+ "step": 266
+ },
+ {
+ "epoch": 1.2586206896551724,
+ "grad_norm": 0.4857093095779419,
+ "learning_rate": 4.75366680027561e-06,
+ "loss": 0.4889,
+ "step": 267
+ },
+ {
+ "epoch": 1.2633228840125392,
+ "grad_norm": 0.484018474817276,
+ "learning_rate": 4.7507580385916906e-06,
+ "loss": 0.4899,
+ "step": 268
+ },
+ {
+ "epoch": 1.2680250783699059,
+ "grad_norm": 0.49720871448516846,
+ "learning_rate": 4.747833104559999e-06,
+ "loss": 0.4654,
+ "step": 269
+ },
+ {
+ "epoch": 1.2727272727272727,
+ "grad_norm": 0.4631911516189575,
+ "learning_rate": 4.744892019197033e-06,
+ "loss": 0.4796,
+ "step": 270
+ },
+ {
+ "epoch": 1.2774294670846396,
+ "grad_norm": 0.5116872787475586,
+ "learning_rate": 4.74193480363534e-06,
+ "loss": 0.4883,
+ "step": 271
+ },
+ {
+ "epoch": 1.2821316614420062,
+ "grad_norm": 0.5275093913078308,
+ "learning_rate": 4.738961479123373e-06,
+ "loss": 0.496,
+ "step": 272
+ },
+ {
+ "epoch": 1.286833855799373,
+ "grad_norm": 0.5001885890960693,
+ "learning_rate": 4.735972067025326e-06,
+ "loss": 0.5012,
+ "step": 273
+ },
+ {
+ "epoch": 1.29153605015674,
+ "grad_norm": 0.5875861048698425,
+ "learning_rate": 4.732966588820991e-06,
+ "loss": 0.4951,
+ "step": 274
+ },
+ {
+ "epoch": 1.2962382445141065,
+ "grad_norm": 0.4893011748790741,
+ "learning_rate": 4.729945066105599e-06,
+ "loss": 0.4742,
+ "step": 275
+ },
+ {
+ "epoch": 1.3009404388714734,
+ "grad_norm": 0.4648543894290924,
+ "learning_rate": 4.726907520589664e-06,
+ "loss": 0.466,
+ "step": 276
+ },
+ {
+ "epoch": 1.3056426332288402,
+ "grad_norm": 0.5300162434577942,
+ "learning_rate": 4.72385397409883e-06,
+ "loss": 0.5072,
+ "step": 277
+ },
+ {
+ "epoch": 1.3103448275862069,
+ "grad_norm": 0.4667080044746399,
+ "learning_rate": 4.720784448573712e-06,
+ "loss": 0.4986,
+ "step": 278
+ },
+ {
+ "epoch": 1.3150470219435737,
+ "grad_norm": 0.5278895497322083,
+ "learning_rate": 4.717698966069739e-06,
+ "loss": 0.5269,
+ "step": 279
+ },
+ {
+ "epoch": 1.3197492163009406,
+ "grad_norm": 0.5325866937637329,
+ "learning_rate": 4.7145975487569965e-06,
+ "loss": 0.5074,
+ "step": 280
+ },
+ {
+ "epoch": 1.3244514106583072,
+ "grad_norm": 0.500861644744873,
+ "learning_rate": 4.711480218920064e-06,
+ "loss": 0.4695,
+ "step": 281
+ },
+ {
+ "epoch": 1.329153605015674,
+ "grad_norm": 0.5263222455978394,
+ "learning_rate": 4.708346998957859e-06,
+ "loss": 0.5173,
+ "step": 282
+ },
+ {
+ "epoch": 1.3338557993730409,
+ "grad_norm": 0.622900128364563,
+ "learning_rate": 4.705197911383473e-06,
+ "loss": 0.4905,
+ "step": 283
+ },
+ {
+ "epoch": 1.3385579937304075,
+ "grad_norm": 0.49273768067359924,
+ "learning_rate": 4.7020329788240115e-06,
+ "loss": 0.4743,
+ "step": 284
+ },
+ {
+ "epoch": 1.3432601880877744,
+ "grad_norm": 0.49558964371681213,
+ "learning_rate": 4.6988522240204325e-06,
+ "loss": 0.4824,
+ "step": 285
+ },
+ {
+ "epoch": 1.347962382445141,
+ "grad_norm": 0.4743976891040802,
+ "learning_rate": 4.695655669827377e-06,
+ "loss": 0.4977,
+ "step": 286
+ },
+ {
+ "epoch": 1.3526645768025078,
+ "grad_norm": 0.49542659521102905,
+ "learning_rate": 4.6924433392130135e-06,
+ "loss": 0.4924,
+ "step": 287
+ },
+ {
+ "epoch": 1.3573667711598745,
+ "grad_norm": 0.7385990619659424,
+ "learning_rate": 4.689215255258866e-06,
+ "loss": 0.5091,
+ "step": 288
+ },
+ {
+ "epoch": 1.3620689655172413,
+ "grad_norm": 0.4826123118400574,
+ "learning_rate": 4.685971441159653e-06,
+ "loss": 0.4791,
+ "step": 289
+ },
+ {
+ "epoch": 1.3667711598746082,
+ "grad_norm": 0.5389033555984497,
+ "learning_rate": 4.682711920223115e-06,
+ "loss": 0.4751,
+ "step": 290
+ },
+ {
+ "epoch": 1.3714733542319748,
+ "grad_norm": 0.5059546232223511,
+ "learning_rate": 4.679436715869856e-06,
+ "loss": 0.499,
+ "step": 291
+ },
+ {
+ "epoch": 1.3761755485893417,
+ "grad_norm": 0.5682849884033203,
+ "learning_rate": 4.676145851633166e-06,
+ "loss": 0.5143,
+ "step": 292
+ },
+ {
+ "epoch": 1.3808777429467085,
+ "grad_norm": 0.4754337668418884,
+ "learning_rate": 4.672839351158856e-06,
+ "loss": 0.4997,
+ "step": 293
+ },
+ {
+ "epoch": 1.3855799373040751,
+ "grad_norm": 0.5227643847465515,
+ "learning_rate": 4.669517238205089e-06,
+ "loss": 0.4834,
+ "step": 294
+ },
+ {
+ "epoch": 1.390282131661442,
+ "grad_norm": 0.4954044222831726,
+ "learning_rate": 4.666179536642208e-06,
+ "loss": 0.483,
+ "step": 295
+ },
+ {
+ "epoch": 1.3949843260188088,
+ "grad_norm": 0.4909021556377411,
+ "learning_rate": 4.662826270452565e-06,
+ "loss": 0.4808,
+ "step": 296
+ },
+ {
+ "epoch": 1.3996865203761755,
+ "grad_norm": 0.4666971266269684,
+ "learning_rate": 4.659457463730347e-06,
+ "loss": 0.488,
+ "step": 297
+ },
+ {
+ "epoch": 1.4043887147335423,
+ "grad_norm": 0.5064187049865723,
+ "learning_rate": 4.6560731406814056e-06,
+ "loss": 0.5046,
+ "step": 298
+ },
+ {
+ "epoch": 1.4090909090909092,
+ "grad_norm": 0.4958318769931793,
+ "learning_rate": 4.65267332562308e-06,
+ "loss": 0.5102,
+ "step": 299
+ },
+ {
+ "epoch": 1.4137931034482758,
+ "grad_norm": 0.5080632567405701,
+ "learning_rate": 4.649258042984026e-06,
+ "loss": 0.5055,
+ "step": 300
+ },
+ {
+ "epoch": 1.4184952978056427,
+ "grad_norm": 0.46236541867256165,
+ "learning_rate": 4.6458273173040395e-06,
+ "loss": 0.4606,
+ "step": 301
+ },
+ {
+ "epoch": 1.4231974921630095,
+ "grad_norm": 1.8524898290634155,
+ "learning_rate": 4.642381173233874e-06,
+ "loss": 0.5002,
+ "step": 302
+ },
+ {
+ "epoch": 1.4278996865203761,
+ "grad_norm": 0.5202615261077881,
+ "learning_rate": 4.638919635535073e-06,
+ "loss": 0.4562,
+ "step": 303
+ },
+ {
+ "epoch": 1.432601880877743,
+ "grad_norm": 0.5293647050857544,
+ "learning_rate": 4.635442729079788e-06,
+ "loss": 0.4806,
+ "step": 304
+ },
+ {
+ "epoch": 1.4373040752351098,
+ "grad_norm": 0.5165356993675232,
+ "learning_rate": 4.6319504788505956e-06,
+ "loss": 0.4775,
+ "step": 305
+ },
+ {
+ "epoch": 1.4420062695924765,
+ "grad_norm": 0.5092841386795044,
+ "learning_rate": 4.628442909940325e-06,
+ "loss": 0.4892,
+ "step": 306
+ },
+ {
+ "epoch": 1.4467084639498433,
+ "grad_norm": 0.511424720287323,
+ "learning_rate": 4.624920047551874e-06,
+ "loss": 0.506,
+ "step": 307
+ },
+ {
+ "epoch": 1.4514106583072102,
+ "grad_norm": 0.5631566643714905,
+ "learning_rate": 4.621381916998029e-06,
+ "loss": 0.4741,
+ "step": 308
+ },
+ {
+ "epoch": 1.4561128526645768,
+ "grad_norm": 0.4748315215110779,
+ "learning_rate": 4.6178285437012806e-06,
+ "loss": 0.5084,
+ "step": 309
+ },
+ {
+ "epoch": 1.4608150470219436,
+ "grad_norm": 0.47158119082450867,
+ "learning_rate": 4.6142599531936435e-06,
+ "loss": 0.4697,
+ "step": 310
+ },
+ {
+ "epoch": 1.4655172413793103,
+ "grad_norm": 0.5358107089996338,
+ "learning_rate": 4.610676171116475e-06,
+ "loss": 0.491,
+ "step": 311
+ },
+ {
+ "epoch": 1.4702194357366771,
+ "grad_norm": 0.47717440128326416,
+ "learning_rate": 4.607077223220286e-06,
+ "loss": 0.4948,
+ "step": 312
+ },
+ {
+ "epoch": 1.4749216300940438,
+ "grad_norm": 0.5041193962097168,
+ "learning_rate": 4.603463135364556e-06,
+ "loss": 0.4648,
+ "step": 313
+ },
+ {
+ "epoch": 1.4796238244514106,
+ "grad_norm": 0.9311274290084839,
+ "learning_rate": 4.5998339335175555e-06,
+ "loss": 0.4866,
+ "step": 314
+ },
+ {
+ "epoch": 1.4843260188087775,
+ "grad_norm": 0.47408604621887207,
+ "learning_rate": 4.596189643756147e-06,
+ "loss": 0.4634,
+ "step": 315
+ },
+ {
+ "epoch": 1.489028213166144,
+ "grad_norm": 0.5052632093429565,
+ "learning_rate": 4.592530292265609e-06,
+ "loss": 0.4843,
+ "step": 316
+ },
+ {
+ "epoch": 1.493730407523511,
+ "grad_norm": 0.5100846886634827,
+ "learning_rate": 4.58885590533944e-06,
+ "loss": 0.4942,
+ "step": 317
+ },
+ {
+ "epoch": 1.4984326018808778,
+ "grad_norm": 0.5132214426994324,
+ "learning_rate": 4.585166509379173e-06,
+ "loss": 0.5135,
+ "step": 318
+ },
+ {
+ "epoch": 1.5031347962382444,
+ "grad_norm": 11.112855911254883,
+ "learning_rate": 4.581462130894186e-06,
+ "loss": 0.4933,
+ "step": 319
+ },
+ {
+ "epoch": 1.5078369905956113,
+ "grad_norm": 0.4873805642127991,
+ "learning_rate": 4.57774279650151e-06,
+ "loss": 0.483,
+ "step": 320
+ },
+ {
+ "epoch": 1.5125391849529781,
+ "grad_norm": 0.5026459693908691,
+ "learning_rate": 4.574008532925638e-06,
+ "loss": 0.5075,
+ "step": 321
+ },
+ {
+ "epoch": 1.5172413793103448,
+ "grad_norm": 0.489947110414505,
+ "learning_rate": 4.570259366998336e-06,
+ "loss": 0.4954,
+ "step": 322
+ },
+ {
+ "epoch": 1.5219435736677116,
+ "grad_norm": 0.48120853304862976,
+ "learning_rate": 4.566495325658445e-06,
+ "loss": 0.5221,
+ "step": 323
+ },
+ {
+ "epoch": 1.5266457680250785,
+ "grad_norm": 0.4880066514015198,
+ "learning_rate": 4.5627164359516915e-06,
+ "loss": 0.5031,
+ "step": 324
+ },
+ {
+ "epoch": 1.531347962382445,
+ "grad_norm": 0.5048410892486572,
+ "learning_rate": 4.558922725030491e-06,
+ "loss": 0.4757,
+ "step": 325
+ },
+ {
+ "epoch": 1.536050156739812,
+ "grad_norm": 0.7033756375312805,
+ "learning_rate": 4.555114220153755e-06,
+ "loss": 0.4285,
+ "step": 326
+ },
+ {
+ "epoch": 1.5407523510971788,
+ "grad_norm": 0.4716516435146332,
+ "learning_rate": 4.551290948686693e-06,
+ "loss": 0.5121,
+ "step": 327
+ },
+ {
+ "epoch": 1.5454545454545454,
+ "grad_norm": 0.4782696068286896,
+ "learning_rate": 4.547452938100615e-06,
+ "loss": 0.5176,
+ "step": 328
+ },
+ {
+ "epoch": 1.5501567398119123,
+ "grad_norm": 0.5119273066520691,
+ "learning_rate": 4.54360021597274e-06,
+ "loss": 0.4941,
+ "step": 329
+ },
+ {
+ "epoch": 1.5548589341692791,
+ "grad_norm": 0.5010069608688354,
+ "learning_rate": 4.539732809985989e-06,
+ "loss": 0.4862,
+ "step": 330
+ },
+ {
+ "epoch": 1.5595611285266457,
+ "grad_norm": 0.5129932165145874,
+ "learning_rate": 4.535850747928796e-06,
+ "loss": 0.4978,
+ "step": 331
+ },
+ {
+ "epoch": 1.5642633228840124,
+ "grad_norm": 0.4957594573497772,
+ "learning_rate": 4.531954057694897e-06,
+ "loss": 0.4814,
+ "step": 332
+ },
+ {
+ "epoch": 1.5689655172413794,
+ "grad_norm": 0.5642824172973633,
+ "learning_rate": 4.5280427672831414e-06,
+ "loss": 0.4888,
+ "step": 333
+ },
+ {
+ "epoch": 1.573667711598746,
+ "grad_norm": 0.4562854468822479,
+ "learning_rate": 4.524116904797281e-06,
+ "loss": 0.4648,
+ "step": 334
+ },
+ {
+ "epoch": 1.5783699059561127,
+ "grad_norm": 0.4849218428134918,
+ "learning_rate": 4.520176498445774e-06,
+ "loss": 0.476,
+ "step": 335
+ },
+ {
+ "epoch": 1.5830721003134798,
+ "grad_norm": 0.5046947002410889,
+ "learning_rate": 4.516221576541581e-06,
+ "loss": 0.4776,
+ "step": 336
+ },
+ {
+ "epoch": 1.5877742946708464,
+ "grad_norm": 0.48211777210235596,
+ "learning_rate": 4.512252167501959e-06,
+ "loss": 0.479,
+ "step": 337
+ },
+ {
+ "epoch": 1.592476489028213,
+ "grad_norm": 0.4812171459197998,
+ "learning_rate": 4.508268299848262e-06,
+ "loss": 0.4849,
+ "step": 338
+ },
+ {
+ "epoch": 1.59717868338558,
+ "grad_norm": 0.5865142345428467,
+ "learning_rate": 4.50427000220573e-06,
+ "loss": 0.499,
+ "step": 339
+ },
+ {
+ "epoch": 1.6018808777429467,
+ "grad_norm": 0.49277785420417786,
+ "learning_rate": 4.50025730330329e-06,
+ "loss": 0.475,
+ "step": 340
+ },
+ {
+ "epoch": 1.6065830721003134,
+ "grad_norm": 0.46771496534347534,
+ "learning_rate": 4.4962302319733445e-06,
+ "loss": 0.494,
+ "step": 341
+ },
+ {
+ "epoch": 1.6112852664576802,
+ "grad_norm": 0.5189441442489624,
+ "learning_rate": 4.492188817151565e-06,
+ "loss": 0.5275,
+ "step": 342
+ },
+ {
+ "epoch": 1.615987460815047,
+ "grad_norm": 0.48845574259757996,
+ "learning_rate": 4.488133087876688e-06,
+ "loss": 0.4676,
+ "step": 343
+ },
+ {
+ "epoch": 1.6206896551724137,
+ "grad_norm": 0.47189632058143616,
+ "learning_rate": 4.484063073290301e-06,
+ "loss": 0.4642,
+ "step": 344
+ },
+ {
+ "epoch": 1.6253918495297806,
+ "grad_norm": 0.5442587733268738,
+ "learning_rate": 4.479978802636637e-06,
+ "loss": 0.4981,
+ "step": 345
+ },
+ {
+ "epoch": 1.6300940438871474,
+ "grad_norm": 0.5048685073852539,
+ "learning_rate": 4.475880305262362e-06,
+ "loss": 0.5037,
+ "step": 346
+ },
+ {
+ "epoch": 1.634796238244514,
+ "grad_norm": 0.4781409800052643,
+ "learning_rate": 4.471767610616366e-06,
+ "loss": 0.4932,
+ "step": 347
+ },
+ {
+ "epoch": 1.6394984326018809,
+ "grad_norm": 0.47388938069343567,
+ "learning_rate": 4.467640748249549e-06,
+ "loss": 0.4687,
+ "step": 348
+ },
+ {
+ "epoch": 1.6442006269592477,
+ "grad_norm": 0.529712438583374,
+ "learning_rate": 4.4634997478146125e-06,
+ "loss": 0.487,
+ "step": 349
+ },
+ {
+ "epoch": 1.6489028213166144,
+ "grad_norm": 0.5114791393280029,
+ "learning_rate": 4.459344639065842e-06,
+ "loss": 0.4809,
+ "step": 350
+ },
+ {
+ "epoch": 1.6536050156739812,
+ "grad_norm": 0.45415258407592773,
+ "learning_rate": 4.455175451858897e-06,
+ "loss": 0.4901,
+ "step": 351
+ },
+ {
+ "epoch": 1.658307210031348,
+ "grad_norm": 0.5842339396476746,
+ "learning_rate": 4.450992216150592e-06,
+ "loss": 0.499,
+ "step": 352
+ },
+ {
+ "epoch": 1.6630094043887147,
+ "grad_norm": 0.48795560002326965,
+ "learning_rate": 4.446794961998689e-06,
+ "loss": 0.4659,
+ "step": 353
+ },
+ {
+ "epoch": 1.6677115987460815,
+ "grad_norm": 0.5531855225563049,
+ "learning_rate": 4.442583719561671e-06,
+ "loss": 0.4923,
+ "step": 354
+ },
+ {
+ "epoch": 1.6724137931034484,
+ "grad_norm": 0.5827644467353821,
+ "learning_rate": 4.438358519098536e-06,
+ "loss": 0.4991,
+ "step": 355
+ },
+ {
+ "epoch": 1.677115987460815,
+ "grad_norm": 0.5260423421859741,
+ "learning_rate": 4.4341193909685685e-06,
+ "loss": 0.4843,
+ "step": 356
+ },
+ {
+ "epoch": 1.6818181818181817,
+ "grad_norm": 0.4969344437122345,
+ "learning_rate": 4.429866365631134e-06,
+ "loss": 0.4915,
+ "step": 357
+ },
+ {
+ "epoch": 1.6865203761755487,
+ "grad_norm": 0.4725005030632019,
+ "learning_rate": 4.425599473645447e-06,
+ "loss": 0.4804,
+ "step": 358
+ },
+ {
+ "epoch": 1.6912225705329154,
+ "grad_norm": 0.47171467542648315,
+ "learning_rate": 4.421318745670364e-06,
+ "loss": 0.4823,
+ "step": 359
+ },
+ {
+ "epoch": 1.695924764890282,
+ "grad_norm": 0.4839799106121063,
+ "learning_rate": 4.4170242124641524e-06,
+ "loss": 0.4585,
+ "step": 360
+ },
+ {
+ "epoch": 1.700626959247649,
+ "grad_norm": 0.4786856472492218,
+ "learning_rate": 4.412715904884277e-06,
+ "loss": 0.49,
+ "step": 361
+ },
+ {
+ "epoch": 1.7053291536050157,
+ "grad_norm": 0.49980080127716064,
+ "learning_rate": 4.4083938538871735e-06,
+ "loss": 0.4675,
+ "step": 362
+ },
+ {
+ "epoch": 1.7100313479623823,
+ "grad_norm": 0.5201369524002075,
+ "learning_rate": 4.4040580905280295e-06,
+ "loss": 0.4862,
+ "step": 363
+ },
+ {
+ "epoch": 1.7147335423197492,
+ "grad_norm": 0.7051575183868408,
+ "learning_rate": 4.3997086459605586e-06,
+ "loss": 0.4822,
+ "step": 364
+ },
+ {
+ "epoch": 1.719435736677116,
+ "grad_norm": 0.48206666111946106,
+ "learning_rate": 4.395345551436779e-06,
+ "loss": 0.5076,
+ "step": 365
+ },
+ {
+ "epoch": 1.7241379310344827,
+ "grad_norm": 0.4817257821559906,
+ "learning_rate": 4.390968838306788e-06,
+ "loss": 0.4623,
+ "step": 366
+ },
+ {
+ "epoch": 1.7288401253918495,
+ "grad_norm": 0.5547840595245361,
+ "learning_rate": 4.386578538018535e-06,
+ "loss": 0.461,
+ "step": 367
+ },
+ {
+ "epoch": 1.7335423197492164,
+ "grad_norm": 0.5085346698760986,
+ "learning_rate": 4.382174682117598e-06,
+ "loss": 0.5068,
+ "step": 368
+ },
+ {
+ "epoch": 1.738244514106583,
+ "grad_norm": 0.4870692193508148,
+ "learning_rate": 4.377757302246956e-06,
+ "loss": 0.4403,
+ "step": 369
+ },
+ {
+ "epoch": 1.7429467084639498,
+ "grad_norm": 0.49482715129852295,
+ "learning_rate": 4.373326430146762e-06,
+ "loss": 0.4986,
+ "step": 370
+ },
+ {
+ "epoch": 1.7476489028213167,
+ "grad_norm": 0.5474854707717896,
+ "learning_rate": 4.368882097654113e-06,
+ "loss": 0.4938,
+ "step": 371
+ },
+ {
+ "epoch": 1.7523510971786833,
+ "grad_norm": 0.5055244565010071,
+ "learning_rate": 4.364424336702825e-06,
+ "loss": 0.4711,
+ "step": 372
+ },
+ {
+ "epoch": 1.7570532915360502,
+ "grad_norm": 0.48241329193115234,
+ "learning_rate": 4.3599531793232e-06,
+ "loss": 0.4856,
+ "step": 373
+ },
+ {
+ "epoch": 1.761755485893417,
+ "grad_norm": 0.4932602047920227,
+ "learning_rate": 4.355468657641797e-06,
+ "loss": 0.4818,
+ "step": 374
+ },
+ {
+ "epoch": 1.7664576802507836,
+ "grad_norm": 0.5512160658836365,
+ "learning_rate": 4.3509708038812035e-06,
+ "loss": 0.4864,
+ "step": 375
+ },
+ {
+ "epoch": 1.7711598746081505,
+ "grad_norm": 0.47026327252388,
+ "learning_rate": 4.346459650359798e-06,
+ "loss": 0.4825,
+ "step": 376
+ },
+ {
+ "epoch": 1.7758620689655173,
+ "grad_norm": 0.4831086993217468,
+ "learning_rate": 4.341935229491525e-06,
+ "loss": 0.4541,
+ "step": 377
+ },
+ {
+ "epoch": 1.780564263322884,
+ "grad_norm": 0.5045217871665955,
+ "learning_rate": 4.337397573785659e-06,
+ "loss": 0.5025,
+ "step": 378
+ },
+ {
+ "epoch": 1.7852664576802508,
+ "grad_norm": 0.5657753348350525,
+ "learning_rate": 4.332846715846566e-06,
+ "loss": 0.4698,
+ "step": 379
+ },
+ {
+ "epoch": 1.7899686520376177,
+ "grad_norm": 0.49546748399734497,
+ "learning_rate": 4.328282688373479e-06,
+ "loss": 0.4911,
+ "step": 380
+ },
+ {
+ "epoch": 1.7946708463949843,
+ "grad_norm": 0.5037291049957275,
+ "learning_rate": 4.323705524160258e-06,
+ "loss": 0.4877,
+ "step": 381
+ },
+ {
+ "epoch": 1.799373040752351,
+ "grad_norm": 0.5256901383399963,
+ "learning_rate": 4.319115256095149e-06,
+ "loss": 0.4662,
+ "step": 382
+ },
+ {
+ "epoch": 1.804075235109718,
+ "grad_norm": 0.4890702962875366,
+ "learning_rate": 4.314511917160557e-06,
+ "loss": 0.4683,
+ "step": 383
+ },
+ {
+ "epoch": 1.8087774294670846,
+ "grad_norm": 0.4724109470844269,
+ "learning_rate": 4.3098955404328045e-06,
+ "loss": 0.4602,
+ "step": 384
+ },
+ {
+ "epoch": 1.8134796238244513,
+ "grad_norm": 0.4933278560638428,
+ "learning_rate": 4.305266159081895e-06,
+ "loss": 0.4806,
+ "step": 385
+ },
+ {
+ "epoch": 1.8181818181818183,
+ "grad_norm": 0.5068219304084778,
+ "learning_rate": 4.3006238063712725e-06,
+ "loss": 0.4647,
+ "step": 386
+ },
+ {
+ "epoch": 1.822884012539185,
+ "grad_norm": 0.5293509364128113,
+ "learning_rate": 4.295968515657583e-06,
+ "loss": 0.4998,
+ "step": 387
+ },
+ {
+ "epoch": 1.8275862068965516,
+ "grad_norm": 0.4775199294090271,
+ "learning_rate": 4.29130032039044e-06,
+ "loss": 0.4821,
+ "step": 388
+ },
+ {
+ "epoch": 1.8322884012539185,
+ "grad_norm": 0.4914006292819977,
+ "learning_rate": 4.2866192541121755e-06,
+ "loss": 0.4735,
+ "step": 389
+ },
+ {
+ "epoch": 1.8369905956112853,
+ "grad_norm": 0.5009908080101013,
+ "learning_rate": 4.281925350457606e-06,
+ "loss": 0.4741,
+ "step": 390
+ },
+ {
+ "epoch": 1.841692789968652,
+ "grad_norm": 0.47211164236068726,
+ "learning_rate": 4.277218643153787e-06,
+ "loss": 0.4786,
+ "step": 391
+ },
+ {
+ "epoch": 1.8463949843260188,
+ "grad_norm": 1.9644113779067993,
+ "learning_rate": 4.272499166019771e-06,
+ "loss": 0.4759,
+ "step": 392
+ },
+ {
+ "epoch": 1.8510971786833856,
+ "grad_norm": 0.535971999168396,
+ "learning_rate": 4.267766952966369e-06,
+ "loss": 0.4665,
+ "step": 393
+ },
+ {
+ "epoch": 1.8557993730407523,
+ "grad_norm": 0.4666787385940552,
+ "learning_rate": 4.2630220379959006e-06,
+ "loss": 0.4417,
+ "step": 394
+ },
+ {
+ "epoch": 1.8605015673981191,
+ "grad_norm": 0.5976264476776123,
+ "learning_rate": 4.258264455201953e-06,
+ "loss": 0.4665,
+ "step": 395
+ },
+ {
+ "epoch": 1.865203761755486,
+ "grad_norm": 0.4814331531524658,
+ "learning_rate": 4.2534942387691335e-06,
+ "loss": 0.4896,
+ "step": 396
+ },
+ {
+ "epoch": 1.8699059561128526,
+ "grad_norm": 0.4929859936237335,
+ "learning_rate": 4.248711422972829e-06,
+ "loss": 0.4765,
+ "step": 397
+ },
+ {
+ "epoch": 1.8746081504702194,
+ "grad_norm": 0.517914354801178,
+ "learning_rate": 4.243916042178954e-06,
+ "loss": 0.4601,
+ "step": 398
+ },
+ {
+ "epoch": 1.8793103448275863,
+ "grad_norm": 0.47731271386146545,
+ "learning_rate": 4.239108130843709e-06,
+ "loss": 0.469,
+ "step": 399
+ },
+ {
+ "epoch": 1.884012539184953,
+ "grad_norm": 0.4939954876899719,
+ "learning_rate": 4.234287723513326e-06,
+ "loss": 0.4929,
+ "step": 400
+ },
+ {
+ "epoch": 1.8887147335423198,
+ "grad_norm": 0.48573923110961914,
+ "learning_rate": 4.229454854823827e-06,
+ "loss": 0.4913,
+ "step": 401
+ },
+ {
+ "epoch": 1.8934169278996866,
+ "grad_norm": 0.5146409273147583,
+ "learning_rate": 4.224609559500772e-06,
+ "loss": 0.502,
+ "step": 402
+ },
+ {
+ "epoch": 1.8981191222570533,
+ "grad_norm": 0.4884675443172455,
+ "learning_rate": 4.21975187235901e-06,
+ "loss": 0.4541,
+ "step": 403
+ },
+ {
+ "epoch": 1.90282131661442,
+ "grad_norm": 0.4871810972690582,
+ "learning_rate": 4.21488182830243e-06,
+ "loss": 0.4811,
+ "step": 404
+ },
+ {
+ "epoch": 1.907523510971787,
+ "grad_norm": 0.5089552402496338,
+ "learning_rate": 4.209999462323706e-06,
+ "loss": 0.4584,
+ "step": 405
+ },
+ {
+ "epoch": 1.9122257053291536,
+ "grad_norm": 0.6191231608390808,
+ "learning_rate": 4.20510480950405e-06,
+ "loss": 0.4885,
+ "step": 406
+ },
+ {
+ "epoch": 1.9169278996865202,
+ "grad_norm": 0.5512096285820007,
+ "learning_rate": 4.200197905012961e-06,
+ "loss": 0.4529,
+ "step": 407
+ },
+ {
+ "epoch": 1.9216300940438873,
+ "grad_norm": 0.4743112027645111,
+ "learning_rate": 4.195278784107965e-06,
+ "loss": 0.4702,
+ "step": 408
+ },
+ {
+ "epoch": 1.926332288401254,
+ "grad_norm": 0.4635118544101715,
+ "learning_rate": 4.19034748213437e-06,
+ "loss": 0.4718,
+ "step": 409
+ },
+ {
+ "epoch": 1.9310344827586206,
+ "grad_norm": 0.48715919256210327,
+ "learning_rate": 4.185404034525008e-06,
+ "loss": 0.4638,
+ "step": 410
+ },
+ {
+ "epoch": 1.9357366771159876,
+ "grad_norm": 0.5373724102973938,
+ "learning_rate": 4.180448476799981e-06,
+ "loss": 0.5009,
+ "step": 411
+ },
+ {
+ "epoch": 1.9404388714733543,
+ "grad_norm": 0.4978715479373932,
+ "learning_rate": 4.175480844566404e-06,
+ "loss": 0.4726,
+ "step": 412
+ },
+ {
+ "epoch": 1.9451410658307209,
+ "grad_norm": 0.44817060232162476,
+ "learning_rate": 4.170501173518152e-06,
+ "loss": 0.4683,
+ "step": 413
+ },
+ {
+ "epoch": 1.9498432601880877,
+ "grad_norm": 0.48472973704338074,
+ "learning_rate": 4.165509499435604e-06,
+ "loss": 0.4662,
+ "step": 414
+ },
+ {
+ "epoch": 1.9545454545454546,
+ "grad_norm": 0.6567174792289734,
+ "learning_rate": 4.16050585818538e-06,
+ "loss": 0.4801,
+ "step": 415
+ },
+ {
+ "epoch": 1.9592476489028212,
+ "grad_norm": 0.5131425857543945,
+ "learning_rate": 4.155490285720092e-06,
+ "loss": 0.5036,
+ "step": 416
+ },
+ {
+ "epoch": 1.963949843260188,
+ "grad_norm": 0.46051982045173645,
+ "learning_rate": 4.150462818078079e-06,
+ "loss": 0.4911,
+ "step": 417
+ },
+ {
+ "epoch": 1.968652037617555,
+ "grad_norm": 0.5288883447647095,
+ "learning_rate": 4.145423491383153e-06,
+ "loss": 0.4871,
+ "step": 418
+ },
+ {
+ "epoch": 1.9733542319749215,
+ "grad_norm": 0.5143817663192749,
+ "learning_rate": 4.14037234184433e-06,
+ "loss": 0.5027,
+ "step": 419
+ },
+ {
+ "epoch": 1.9780564263322884,
+ "grad_norm": 0.46323707699775696,
+ "learning_rate": 4.135309405755583e-06,
+ "loss": 0.4876,
+ "step": 420
+ },
+ {
+ "epoch": 1.9827586206896552,
+ "grad_norm": 0.5239706039428711,
+ "learning_rate": 4.130234719495574e-06,
+ "loss": 0.4702,
+ "step": 421
+ },
+ {
+ "epoch": 1.9874608150470219,
+ "grad_norm": 0.538753867149353,
+ "learning_rate": 4.125148319527391e-06,
+ "loss": 0.4638,
+ "step": 422
+ },
+ {
+ "epoch": 1.9921630094043887,
+ "grad_norm": 0.5180181860923767,
+ "learning_rate": 4.1200502423982904e-06,
+ "loss": 0.4841,
+ "step": 423
+ },
+ {
+ "epoch": 1.9968652037617556,
+ "grad_norm": 0.6698167324066162,
+ "learning_rate": 4.1149405247394295e-06,
+ "loss": 0.4882,
+ "step": 424
+ },
+ {
+ "epoch": 2.0047021943573666,
+ "grad_norm": 0.9728522896766663,
+ "learning_rate": 4.10981920326561e-06,
+ "loss": 0.9125,
+ "step": 425
+ },
+ {
+ "epoch": 2.0094043887147337,
+ "grad_norm": 0.7356107831001282,
+ "learning_rate": 4.104686314775009e-06,
+ "loss": 0.4422,
+ "step": 426
+ },
+ {
+ "epoch": 2.0141065830721003,
+ "grad_norm": 0.44414228200912476,
+ "learning_rate": 4.099541896148914e-06,
+ "loss": 0.4511,
+ "step": 427
+ },
+ {
+ "epoch": 2.018808777429467,
+ "grad_norm": 0.5738011002540588,
+ "learning_rate": 4.094385984351462e-06,
+ "loss": 0.4457,
+ "step": 428
+ },
+ {
+ "epoch": 2.023510971786834,
+ "grad_norm": 0.4643106460571289,
+ "learning_rate": 4.0892186164293715e-06,
+ "loss": 0.4644,
+ "step": 429
+ },
+ {
+ "epoch": 2.0282131661442007,
+ "grad_norm": 0.5355309247970581,
+ "learning_rate": 4.0840398295116745e-06,
+ "loss": 0.4535,
+ "step": 430
+ },
+ {
+ "epoch": 2.0329153605015673,
+ "grad_norm": 0.512458324432373,
+ "learning_rate": 4.078849660809456e-06,
+ "loss": 0.4481,
+ "step": 431
+ },
+ {
+ "epoch": 2.0376175548589344,
+ "grad_norm": 0.5055253505706787,
+ "learning_rate": 4.073648147615579e-06,
+ "loss": 0.4309,
+ "step": 432
+ },
+ {
+ "epoch": 2.042319749216301,
+ "grad_norm": 0.5128353834152222,
+ "learning_rate": 4.068435327304421e-06,
+ "loss": 0.4562,
+ "step": 433
+ },
+ {
+ "epoch": 2.0470219435736676,
+ "grad_norm": 0.4432103633880615,
+ "learning_rate": 4.063211237331603e-06,
+ "loss": 0.4535,
+ "step": 434
+ },
+ {
+ "epoch": 2.0517241379310347,
+ "grad_norm": 0.5092498660087585,
+ "learning_rate": 4.057975915233725e-06,
+ "loss": 0.4385,
+ "step": 435
+ },
+ {
+ "epoch": 2.0564263322884013,
+ "grad_norm": 0.4798133671283722,
+ "learning_rate": 4.052729398628089e-06,
+ "loss": 0.466,
+ "step": 436
+ },
+ {
+ "epoch": 2.061128526645768,
+ "grad_norm": 0.5094019770622253,
+ "learning_rate": 4.047471725212437e-06,
+ "loss": 0.4624,
+ "step": 437
+ },
+ {
+ "epoch": 2.0658307210031346,
+ "grad_norm": 0.5814178586006165,
+ "learning_rate": 4.042202932764673e-06,
+ "loss": 0.4472,
+ "step": 438
+ },
+ {
+ "epoch": 2.0705329153605017,
+ "grad_norm": 0.503394365310669,
+ "learning_rate": 4.036923059142595e-06,
+ "loss": 0.4481,
+ "step": 439
+ },
+ {
+ "epoch": 2.0752351097178683,
+ "grad_norm": 0.5108861923217773,
+ "learning_rate": 4.031632142283623e-06,
+ "loss": 0.4416,
+ "step": 440
+ },
+ {
+ "epoch": 2.079937304075235,
+ "grad_norm": 0.5303971171379089,
+ "learning_rate": 4.026330220204524e-06,
+ "loss": 0.4515,
+ "step": 441
+ },
+ {
+ "epoch": 2.084639498432602,
+ "grad_norm": 0.45014286041259766,
+ "learning_rate": 4.021017331001146e-06,
+ "loss": 0.441,
+ "step": 442
+ },
+ {
+ "epoch": 2.0893416927899686,
+ "grad_norm": 0.5371219515800476,
+ "learning_rate": 4.015693512848131e-06,
+ "loss": 0.4471,
+ "step": 443
+ },
+ {
+ "epoch": 2.0940438871473352,
+ "grad_norm": 0.5105510354042053,
+ "learning_rate": 4.0103588039986556e-06,
+ "loss": 0.4534,
+ "step": 444
+ },
+ {
+ "epoch": 2.0987460815047023,
+ "grad_norm": 0.4960611164569855,
+ "learning_rate": 4.005013242784146e-06,
+ "loss": 0.46,
+ "step": 445
+ },
+ {
+ "epoch": 2.103448275862069,
+ "grad_norm": 0.500354528427124,
+ "learning_rate": 3.999656867614006e-06,
+ "loss": 0.45,
+ "step": 446
+ },
+ {
+ "epoch": 2.1081504702194356,
+ "grad_norm": 0.4733876585960388,
+ "learning_rate": 3.994289716975341e-06,
+ "loss": 0.4644,
+ "step": 447
+ },
+ {
+ "epoch": 2.1128526645768027,
+ "grad_norm": 0.5002915263175964,
+ "learning_rate": 3.988911829432682e-06,
+ "loss": 0.4493,
+ "step": 448
+ },
+ {
+ "epoch": 2.1175548589341693,
+ "grad_norm": 0.48520293831825256,
+ "learning_rate": 3.983523243627706e-06,
+ "loss": 0.4458,
+ "step": 449
+ },
+ {
+ "epoch": 2.122257053291536,
+ "grad_norm": 0.6339934468269348,
+ "learning_rate": 3.978123998278962e-06,
+ "loss": 0.4352,
+ "step": 450
+ },
+ {
+ "epoch": 2.126959247648903,
+ "grad_norm": 1.172338843345642,
+ "learning_rate": 3.97271413218159e-06,
+ "loss": 0.4664,
+ "step": 451
+ },
+ {
+ "epoch": 2.1316614420062696,
+ "grad_norm": 0.47842296957969666,
+ "learning_rate": 3.9672936842070425e-06,
+ "loss": 0.4604,
+ "step": 452
+ },
+ {
+ "epoch": 2.1363636363636362,
+ "grad_norm": 0.506851077079773,
+ "learning_rate": 3.9618626933028086e-06,
+ "loss": 0.4674,
+ "step": 453
+ },
+ {
+ "epoch": 2.1410658307210033,
+ "grad_norm": 0.4922677278518677,
+ "learning_rate": 3.956421198492128e-06,
+ "loss": 0.4476,
+ "step": 454
+ },
+ {
+ "epoch": 2.14576802507837,
+ "grad_norm": 0.5307339429855347,
+ "learning_rate": 3.950969238873714e-06,
+ "loss": 0.4463,
+ "step": 455
+ },
+ {
+ "epoch": 2.1504702194357366,
+ "grad_norm": 0.5131121873855591,
+ "learning_rate": 3.9455068536214765e-06,
+ "loss": 0.4779,
+ "step": 456
+ },
+ {
+ "epoch": 2.1551724137931036,
+ "grad_norm": 0.5438089966773987,
+ "learning_rate": 3.9400340819842335e-06,
+ "loss": 0.4563,
+ "step": 457
+ },
+ {
+ "epoch": 2.1598746081504703,
+ "grad_norm": 0.7426711916923523,
+ "learning_rate": 3.934550963285432e-06,
+ "loss": 0.4561,
+ "step": 458
+ },
+ {
+ "epoch": 2.164576802507837,
+ "grad_norm": 0.482920378446579,
+ "learning_rate": 3.9290575369228664e-06,
+ "loss": 0.4293,
+ "step": 459
+ },
+ {
+ "epoch": 2.169278996865204,
+ "grad_norm": 0.6583715081214905,
+ "learning_rate": 3.923553842368396e-06,
+ "loss": 0.4682,
+ "step": 460
+ },
+ {
+ "epoch": 2.1739811912225706,
+ "grad_norm": 0.47901806235313416,
+ "learning_rate": 3.918039919167658e-06,
+ "loss": 0.4342,
+ "step": 461
+ },
+ {
+ "epoch": 2.1786833855799372,
+ "grad_norm": 0.4929746389389038,
+ "learning_rate": 3.912515806939786e-06,
+ "loss": 0.4478,
+ "step": 462
+ },
+ {
+ "epoch": 2.183385579937304,
+ "grad_norm": 0.48205333948135376,
+ "learning_rate": 3.906981545377124e-06,
+ "loss": 0.4595,
+ "step": 463
+ },
+ {
+ "epoch": 2.188087774294671,
+ "grad_norm": 0.5059337019920349,
+ "learning_rate": 3.901437174244943e-06,
+ "loss": 0.4294,
+ "step": 464
+ },
+ {
+ "epoch": 2.1927899686520376,
+ "grad_norm": 0.4752981662750244,
+ "learning_rate": 3.895882733381154e-06,
+ "loss": 0.448,
+ "step": 465
+ },
+ {
+ "epoch": 2.197492163009404,
+ "grad_norm": 0.5249196290969849,
+ "learning_rate": 3.890318262696023e-06,
+ "loss": 0.4655,
+ "step": 466
+ },
+ {
+ "epoch": 2.2021943573667713,
+ "grad_norm": 0.48044726252555847,
+ "learning_rate": 3.8847438021718805e-06,
+ "loss": 0.4413,
+ "step": 467
+ },
+ {
+ "epoch": 2.206896551724138,
+ "grad_norm": 0.84516841173172,
+ "learning_rate": 3.879159391862839e-06,
+ "loss": 0.4645,
+ "step": 468
+ },
+ {
+ "epoch": 2.2115987460815045,
+ "grad_norm": 0.5334392786026001,
+ "learning_rate": 3.873565071894503e-06,
+ "loss": 0.4347,
+ "step": 469
+ },
+ {
+ "epoch": 2.2163009404388716,
+ "grad_norm": 0.5113687515258789,
+ "learning_rate": 3.86796088246368e-06,
+ "loss": 0.4314,
+ "step": 470
+ },
+ {
+ "epoch": 2.2210031347962382,
+ "grad_norm": 0.5226101279258728,
+ "learning_rate": 3.8623468638380905e-06,
+ "loss": 0.418,
+ "step": 471
+ },
+ {
+ "epoch": 2.225705329153605,
+ "grad_norm": 0.4901522099971771,
+ "learning_rate": 3.856723056356085e-06,
+ "loss": 0.4597,
+ "step": 472
+ },
+ {
+ "epoch": 2.230407523510972,
+ "grad_norm": 0.5312012434005737,
+ "learning_rate": 3.851089500426346e-06,
+ "loss": 0.4444,
+ "step": 473
+ },
+ {
+ "epoch": 2.2351097178683386,
+ "grad_norm": 0.5347906351089478,
+ "learning_rate": 3.845446236527605e-06,
+ "loss": 0.4447,
+ "step": 474
+ },
+ {
+ "epoch": 2.239811912225705,
+ "grad_norm": 0.4781494438648224,
+ "learning_rate": 3.8397933052083445e-06,
+ "loss": 0.462,
+ "step": 475
+ },
+ {
+ "epoch": 2.2445141065830723,
+ "grad_norm": 0.5215012431144714,
+ "learning_rate": 3.834130747086512e-06,
+ "loss": 0.4475,
+ "step": 476
+ },
+ {
+ "epoch": 2.249216300940439,
+ "grad_norm": 0.5048666000366211,
+ "learning_rate": 3.828458602849226e-06,
+ "loss": 0.4483,
+ "step": 477
+ },
+ {
+ "epoch": 2.2539184952978055,
+ "grad_norm": 0.5508173108100891,
+ "learning_rate": 3.822776913252485e-06,
+ "loss": 0.4511,
+ "step": 478
+ },
+ {
+ "epoch": 2.2586206896551726,
+ "grad_norm": 0.5031043887138367,
+ "learning_rate": 3.817085719120872e-06,
+ "loss": 0.4019,
+ "step": 479
+ },
+ {
+ "epoch": 2.2633228840125392,
+ "grad_norm": 0.508939802646637,
+ "learning_rate": 3.811385061347263e-06,
+ "loss": 0.4461,
+ "step": 480
+ },
+ {
+ "epoch": 2.268025078369906,
+ "grad_norm": 0.5605170726776123,
+ "learning_rate": 3.805674980892535e-06,
+ "loss": 0.4695,
+ "step": 481
+ },
+ {
+ "epoch": 2.2727272727272725,
+ "grad_norm": 0.5526806712150574,
+ "learning_rate": 3.7999555187852667e-06,
+ "loss": 0.4575,
+ "step": 482
+ },
+ {
+ "epoch": 2.2774294670846396,
+ "grad_norm": 0.47659724950790405,
+ "learning_rate": 3.7942267161214497e-06,
+ "loss": 0.4433,
+ "step": 483
+ },
+ {
+ "epoch": 2.282131661442006,
+ "grad_norm": 0.49713975191116333,
+ "learning_rate": 3.7884886140641884e-06,
+ "loss": 0.4692,
+ "step": 484
+ },
+ {
+ "epoch": 2.2868338557993733,
+ "grad_norm": 0.48685988783836365,
+ "learning_rate": 3.7827412538434062e-06,
+ "loss": 0.4328,
+ "step": 485
+ },
+ {
+ "epoch": 2.29153605015674,
+ "grad_norm": 0.5074832439422607,
+ "learning_rate": 3.7769846767555495e-06,
+ "loss": 0.4598,
+ "step": 486
+ },
+ {
+ "epoch": 2.2962382445141065,
+ "grad_norm": 0.5333994030952454,
+ "learning_rate": 3.7712189241632898e-06,
+ "loss": 0.4554,
+ "step": 487
+ },
+ {
+ "epoch": 2.300940438871473,
+ "grad_norm": 0.49985551834106445,
+ "learning_rate": 3.7654440374952288e-06,
+ "loss": 0.4421,
+ "step": 488
+ },
+ {
+ "epoch": 2.30564263322884,
+ "grad_norm": 0.4791257679462433,
+ "learning_rate": 3.7596600582455976e-06,
+ "loss": 0.4187,
+ "step": 489
+ },
+ {
+ "epoch": 2.310344827586207,
+ "grad_norm": 0.4951220154762268,
+ "learning_rate": 3.75386702797396e-06,
+ "loss": 0.4205,
+ "step": 490
+ },
+ {
+ "epoch": 2.3150470219435735,
+ "grad_norm": 0.4765990674495697,
+ "learning_rate": 3.7480649883049164e-06,
+ "loss": 0.4251,
+ "step": 491
+ },
+ {
+ "epoch": 2.3197492163009406,
+ "grad_norm": 0.5125405192375183,
+ "learning_rate": 3.7422539809277993e-06,
+ "loss": 0.4361,
+ "step": 492
+ },
+ {
+ "epoch": 2.324451410658307,
+ "grad_norm": 0.5286112427711487,
+ "learning_rate": 3.736434047596379e-06,
+ "loss": 0.4423,
+ "step": 493
+ },
+ {
+ "epoch": 2.329153605015674,
+ "grad_norm": 0.47961002588272095,
+ "learning_rate": 3.73060523012856e-06,
+ "loss": 0.453,
+ "step": 494
+ },
+ {
+ "epoch": 2.333855799373041,
+ "grad_norm": 0.5857998728752136,
+ "learning_rate": 3.724767570406082e-06,
+ "loss": 0.4674,
+ "step": 495
+ },
+ {
+ "epoch": 2.3385579937304075,
+ "grad_norm": 0.5348326563835144,
+ "learning_rate": 3.7189211103742206e-06,
+ "loss": 0.4267,
+ "step": 496
+ },
+ {
+ "epoch": 2.343260188087774,
+ "grad_norm": 0.4718475937843323,
+ "learning_rate": 3.7130658920414818e-06,
+ "loss": 0.4619,
+ "step": 497
+ },
+ {
+ "epoch": 2.347962382445141,
+ "grad_norm": 0.44225215911865234,
+ "learning_rate": 3.7072019574793034e-06,
+ "loss": 0.4712,
+ "step": 498
+ },
+ {
+ "epoch": 2.352664576802508,
+ "grad_norm": 0.48492008447647095,
+ "learning_rate": 3.701329348821752e-06,
+ "loss": 0.4521,
+ "step": 499
+ },
+ {
+ "epoch": 2.3573667711598745,
+ "grad_norm": 0.49741214513778687,
+ "learning_rate": 3.695448108265221e-06,
+ "loss": 0.4378,
+ "step": 500
+ },
+ {
+ "epoch": 2.3620689655172415,
+ "grad_norm": 0.5086454749107361,
+ "learning_rate": 3.6895582780681254e-06,
+ "loss": 0.4349,
+ "step": 501
+ },
+ {
+ "epoch": 2.366771159874608,
+ "grad_norm": 0.49111631512641907,
+ "learning_rate": 3.683659900550598e-06,
+ "loss": 0.4625,
+ "step": 502
+ },
+ {
+ "epoch": 2.371473354231975,
+ "grad_norm": 0.5006322264671326,
+ "learning_rate": 3.6777530180941894e-06,
+ "loss": 0.4457,
+ "step": 503
+ },
+ {
+ "epoch": 2.376175548589342,
+ "grad_norm": 0.5934097170829773,
+ "learning_rate": 3.671837673141559e-06,
+ "loss": 0.4306,
+ "step": 504
+ },
+ {
+ "epoch": 2.3808777429467085,
+ "grad_norm": 0.626039981842041,
+ "learning_rate": 3.6659139081961707e-06,
+ "loss": 0.4464,
+ "step": 505
+ },
+ {
+ "epoch": 2.385579937304075,
+ "grad_norm": 0.4751131236553192,
+ "learning_rate": 3.6599817658219916e-06,
+ "loss": 0.4508,
+ "step": 506
+ },
+ {
+ "epoch": 2.3902821316614418,
+ "grad_norm": 1.4542276859283447,
+ "learning_rate": 3.6540412886431796e-06,
+ "loss": 0.4606,
+ "step": 507
+ },
+ {
+ "epoch": 2.394984326018809,
+ "grad_norm": 0.5189768075942993,
+ "learning_rate": 3.648092519343783e-06,
+ "loss": 0.4435,
+ "step": 508
+ },
+ {
+ "epoch": 2.3996865203761755,
+ "grad_norm": 1.4583938121795654,
+ "learning_rate": 3.642135500667431e-06,
+ "loss": 0.4314,
+ "step": 509
+ },
+ {
+ "epoch": 2.4043887147335425,
+ "grad_norm": 0.5038107633590698,
+ "learning_rate": 3.6361702754170247e-06,
+ "loss": 0.4463,
+ "step": 510
+ },
+ {
+ "epoch": 2.409090909090909,
+ "grad_norm": 0.5786447525024414,
+ "learning_rate": 3.630196886454435e-06,
+ "loss": 0.4281,
+ "step": 511
+ },
+ {
+ "epoch": 2.413793103448276,
+ "grad_norm": 0.48684218525886536,
+ "learning_rate": 3.62421537670019e-06,
+ "loss": 0.4432,
+ "step": 512
+ },
+ {
+ "epoch": 2.4184952978056424,
+ "grad_norm": 0.5117013454437256,
+ "learning_rate": 3.618225789133167e-06,
+ "loss": 0.4464,
+ "step": 513
+ },
+ {
+ "epoch": 2.4231974921630095,
+ "grad_norm": 0.49249181151390076,
+ "learning_rate": 3.612228166790287e-06,
+ "loss": 0.4465,
+ "step": 514
+ },
+ {
+ "epoch": 2.427899686520376,
+ "grad_norm": 0.5761134624481201,
+ "learning_rate": 3.606222552766201e-06,
+ "loss": 0.4539,
+ "step": 515
+ },
+ {
+ "epoch": 2.4326018808777428,
+ "grad_norm": 0.4839339256286621,
+ "learning_rate": 3.6002089902129844e-06,
+ "loss": 0.4469,
+ "step": 516
+ },
+ {
+ "epoch": 2.43730407523511,
+ "grad_norm": 0.4765976369380951,
+ "learning_rate": 3.5941875223398225e-06,
+ "loss": 0.4379,
+ "step": 517
+ },
+ {
+ "epoch": 2.4420062695924765,
+ "grad_norm": 0.5239338874816895,
+ "learning_rate": 3.588158192412707e-06,
+ "loss": 0.4354,
+ "step": 518
+ },
+ {
+ "epoch": 2.446708463949843,
+ "grad_norm": 0.48244595527648926,
+ "learning_rate": 3.582121043754116e-06,
+ "loss": 0.438,
+ "step": 519
+ },
+ {
+ "epoch": 2.45141065830721,
+ "grad_norm": 0.4641244411468506,
+ "learning_rate": 3.5760761197427097e-06,
+ "loss": 0.438,
+ "step": 520
+ },
+ {
+ "epoch": 2.456112852664577,
+ "grad_norm": 0.48468074202537537,
+ "learning_rate": 3.570023463813017e-06,
+ "loss": 0.4306,
+ "step": 521
+ },
+ {
+ "epoch": 2.4608150470219434,
+ "grad_norm": 0.48626402020454407,
+ "learning_rate": 3.5639631194551216e-06,
+ "loss": 0.4531,
+ "step": 522
+ },
+ {
+ "epoch": 2.4655172413793105,
+ "grad_norm": 0.5581764578819275,
+ "learning_rate": 3.557895130214352e-06,
+ "loss": 0.4451,
+ "step": 523
+ },
+ {
+ "epoch": 2.470219435736677,
+ "grad_norm": 0.6739279627799988,
+ "learning_rate": 3.5518195396909653e-06,
+ "loss": 0.4636,
+ "step": 524
+ },
+ {
+ "epoch": 2.4749216300940438,
+ "grad_norm": 0.550710916519165,
+ "learning_rate": 3.5457363915398384e-06,
+ "loss": 0.4513,
+ "step": 525
+ },
+ {
+ "epoch": 2.479623824451411,
+ "grad_norm": 0.479632705450058,
+ "learning_rate": 3.539645729470151e-06,
+ "loss": 0.4387,
+ "step": 526
+ },
+ {
+ "epoch": 2.4843260188087775,
+ "grad_norm": 0.48741331696510315,
+ "learning_rate": 3.5335475972450715e-06,
+ "loss": 0.4388,
+ "step": 527
+ },
+ {
+ "epoch": 2.489028213166144,
+ "grad_norm": 0.4964964985847473,
+ "learning_rate": 3.5274420386814458e-06,
+ "loss": 0.4643,
+ "step": 528
+ },
+ {
+ "epoch": 2.493730407523511,
+ "grad_norm": 0.5134934186935425,
+ "learning_rate": 3.521329097649478e-06,
+ "loss": 0.4454,
+ "step": 529
+ },
+ {
+ "epoch": 2.498432601880878,
+ "grad_norm": 0.4962058961391449,
+ "learning_rate": 3.515208818072418e-06,
+ "loss": 0.4408,
+ "step": 530
+ },
+ {
+ "epoch": 2.5031347962382444,
+ "grad_norm": 0.5611489415168762,
+ "learning_rate": 3.509081243926247e-06,
+ "loss": 0.4306,
+ "step": 531
+ },
+ {
+ "epoch": 2.507836990595611,
+ "grad_norm": 0.7012472748756409,
+ "learning_rate": 3.5029464192393557e-06,
+ "loss": 0.4614,
+ "step": 532
+ },
+ {
+ "epoch": 2.512539184952978,
+ "grad_norm": 0.5351004004478455,
+ "learning_rate": 3.4968043880922363e-06,
+ "loss": 0.4151,
+ "step": 533
+ },
+ {
+ "epoch": 2.5172413793103448,
+ "grad_norm": 0.5087808966636658,
+ "learning_rate": 3.4906551946171603e-06,
+ "loss": 0.4242,
+ "step": 534
+ },
+ {
+ "epoch": 2.521943573667712,
+ "grad_norm": 0.5459093451499939,
+ "learning_rate": 3.484498882997861e-06,
+ "loss": 0.4215,
+ "step": 535
+ },
+ {
+ "epoch": 2.5266457680250785,
+ "grad_norm": 0.49804285168647766,
+ "learning_rate": 3.478335497469219e-06,
+ "loss": 0.4492,
+ "step": 536
+ },
+ {
+ "epoch": 2.531347962382445,
+ "grad_norm": 0.4959704875946045,
+ "learning_rate": 3.472165082316943e-06,
+ "loss": 0.4511,
+ "step": 537
+ },
+ {
+ "epoch": 2.5360501567398117,
+ "grad_norm": 0.5059382319450378,
+ "learning_rate": 3.465987681877251e-06,
+ "loss": 0.4419,
+ "step": 538
+ },
+ {
+ "epoch": 2.540752351097179,
+ "grad_norm": 0.7398380637168884,
+ "learning_rate": 3.4598033405365527e-06,
+ "loss": 0.4548,
+ "step": 539
+ },
+ {
+ "epoch": 2.5454545454545454,
+ "grad_norm": 0.5326687693595886,
+ "learning_rate": 3.45361210273113e-06,
+ "loss": 0.4473,
+ "step": 540
+ },
+ {
+ "epoch": 2.5501567398119125,
+ "grad_norm": 0.5069761872291565,
+ "learning_rate": 3.447414012946818e-06,
+ "loss": 0.4343,
+ "step": 541
+ },
+ {
+ "epoch": 2.554858934169279,
+ "grad_norm": 0.45915964245796204,
+ "learning_rate": 3.4412091157186853e-06,
+ "loss": 0.4499,
+ "step": 542
+ },
+ {
+ "epoch": 2.5595611285266457,
+ "grad_norm": 0.5174360275268555,
+ "learning_rate": 3.4349974556307146e-06,
+ "loss": 0.44,
+ "step": 543
+ },
+ {
+ "epoch": 2.5642633228840124,
+ "grad_norm": 0.5008105039596558,
+ "learning_rate": 3.4287790773154807e-06,
+ "loss": 0.4648,
+ "step": 544
+ },
+ {
+ "epoch": 2.5689655172413794,
+ "grad_norm": 0.5628801584243774,
+ "learning_rate": 3.4225540254538297e-06,
+ "loss": 0.462,
+ "step": 545
+ },
+ {
+ "epoch": 2.573667711598746,
+ "grad_norm": 0.9913654923439026,
+ "learning_rate": 3.416322344774562e-06,
+ "loss": 0.4403,
+ "step": 546
+ },
+ {
+ "epoch": 2.5783699059561127,
+ "grad_norm": 0.5034172534942627,
+ "learning_rate": 3.4100840800541055e-06,
+ "loss": 0.4622,
+ "step": 547
+ },
+ {
+ "epoch": 2.58307210031348,
+ "grad_norm": 0.495516836643219,
+ "learning_rate": 3.4038392761161986e-06,
+ "loss": 0.4523,
+ "step": 548
+ },
+ {
+ "epoch": 2.5877742946708464,
+ "grad_norm": 0.48142367601394653,
+ "learning_rate": 3.3975879778315634e-06,
+ "loss": 0.4242,
+ "step": 549
+ },
+ {
+ "epoch": 2.592476489028213,
+ "grad_norm": 0.4635900557041168,
+ "learning_rate": 3.391330230117587e-06,
+ "loss": 0.3949,
+ "step": 550
+ },
+ {
+ "epoch": 2.5971786833855797,
+ "grad_norm": 0.4769044816493988,
+ "learning_rate": 3.385066077937997e-06,
+ "loss": 0.4651,
+ "step": 551
+ },
+ {
+ "epoch": 2.6018808777429467,
+ "grad_norm": 1.059553861618042,
+ "learning_rate": 3.378795566302541e-06,
+ "loss": 0.4243,
+ "step": 552
+ },
+ {
+ "epoch": 2.6065830721003134,
+ "grad_norm": 0.512134850025177,
+ "learning_rate": 3.372518740266658e-06,
+ "loss": 0.4435,
+ "step": 553
+ },
+ {
+ "epoch": 2.6112852664576804,
+ "grad_norm": 0.5267173647880554,
+ "learning_rate": 3.36623564493116e-06,
+ "loss": 0.4558,
+ "step": 554
+ },
+ {
+ "epoch": 2.615987460815047,
+ "grad_norm": 0.49343907833099365,
+ "learning_rate": 3.3599463254419047e-06,
+ "loss": 0.4598,
+ "step": 555
+ },
+ {
+ "epoch": 2.6206896551724137,
+ "grad_norm": 0.5496839284896851,
+ "learning_rate": 3.3536508269894724e-06,
+ "loss": 0.4669,
+ "step": 556
+ },
+ {
+ "epoch": 2.6253918495297803,
+ "grad_norm": 0.5957831740379333,
+ "learning_rate": 3.347349194808842e-06,
+ "loss": 0.4533,
+ "step": 557
+ },
+ {
+ "epoch": 2.6300940438871474,
+ "grad_norm": 0.5049230456352234,
+ "learning_rate": 3.3410414741790625e-06,
+ "loss": 0.4293,
+ "step": 558
+ },
+ {
+ "epoch": 2.634796238244514,
+ "grad_norm": 0.5167728066444397,
+ "learning_rate": 3.3347277104229332e-06,
+ "loss": 0.443,
+ "step": 559
+ },
+ {
+ "epoch": 2.639498432601881,
+ "grad_norm": 0.6090758442878723,
+ "learning_rate": 3.3284079489066728e-06,
+ "loss": 0.4378,
+ "step": 560
+ },
+ {
+ "epoch": 2.6442006269592477,
+ "grad_norm": 0.5165027379989624,
+ "learning_rate": 3.3220822350395966e-06,
+ "loss": 0.4302,
+ "step": 561
+ },
+ {
+ "epoch": 2.6489028213166144,
+ "grad_norm": 0.5152680277824402,
+ "learning_rate": 3.31575061427379e-06,
+ "loss": 0.4311,
+ "step": 562
+ },
+ {
+ "epoch": 2.653605015673981,
+ "grad_norm": 0.547235906124115,
+ "learning_rate": 3.3094131321037783e-06,
+ "loss": 0.4371,
+ "step": 563
+ },
+ {
+ "epoch": 2.658307210031348,
+ "grad_norm": 0.521981418132782,
+ "learning_rate": 3.303069834066206e-06,
+ "loss": 0.4346,
+ "step": 564
+ },
+ {
+ "epoch": 2.6630094043887147,
+ "grad_norm": 0.5127217769622803,
+ "learning_rate": 3.2967207657395055e-06,
+ "loss": 0.474,
+ "step": 565
+ },
+ {
+ "epoch": 2.6677115987460818,
+ "grad_norm": 0.5210872888565063,
+ "learning_rate": 3.2903659727435692e-06,
+ "loss": 0.4622,
+ "step": 566
+ },
+ {
+ "epoch": 2.6724137931034484,
+ "grad_norm": 0.5768873691558838,
+ "learning_rate": 3.284005500739423e-06,
+ "loss": 0.4556,
+ "step": 567
+ },
+ {
+ "epoch": 2.677115987460815,
+ "grad_norm": 0.5305764675140381,
+ "learning_rate": 3.2776393954289e-06,
+ "loss": 0.429,
+ "step": 568
+ },
+ {
+ "epoch": 2.6818181818181817,
+ "grad_norm": 0.5312129855155945,
+ "learning_rate": 3.271267702554307e-06,
+ "loss": 0.4208,
+ "step": 569
+ },
+ {
+ "epoch": 2.6865203761755487,
+ "grad_norm": 0.5433884859085083,
+ "learning_rate": 3.2648904678981032e-06,
+ "loss": 0.4647,
+ "step": 570
+ },
+ {
+ "epoch": 2.6912225705329154,
+ "grad_norm": 1.2331725358963013,
+ "learning_rate": 3.2585077372825636e-06,
+ "loss": 0.4126,
+ "step": 571
+ },
+ {
+ "epoch": 2.695924764890282,
+ "grad_norm": 0.5495198369026184,
+ "learning_rate": 3.2521195565694543e-06,
+ "loss": 0.4453,
+ "step": 572
+ },
+ {
+ "epoch": 2.700626959247649,
+ "grad_norm": 0.5230907201766968,
+ "learning_rate": 3.2457259716597023e-06,
+ "loss": 0.446,
+ "step": 573
+ },
+ {
+ "epoch": 2.7053291536050157,
+ "grad_norm": 0.4807503819465637,
+ "learning_rate": 3.2393270284930658e-06,
+ "loss": 0.4547,
+ "step": 574
+ },
+ {
+ "epoch": 2.7100313479623823,
+ "grad_norm": 0.5169614553451538,
+ "learning_rate": 3.2329227730478026e-06,
+ "loss": 0.4319,
+ "step": 575
+ },
+ {
+ "epoch": 2.714733542319749,
+ "grad_norm": 0.502966046333313,
+ "learning_rate": 3.2265132513403415e-06,
+ "loss": 0.4196,
+ "step": 576
+ },
+ {
+ "epoch": 2.719435736677116,
+ "grad_norm": 0.5387672781944275,
+ "learning_rate": 3.22009850942495e-06,
+ "loss": 0.4449,
+ "step": 577
+ },
+ {
+ "epoch": 2.7241379310344827,
+ "grad_norm": 0.5503709316253662,
+ "learning_rate": 3.213678593393405e-06,
+ "loss": 0.4589,
+ "step": 578
+ },
+ {
+ "epoch": 2.7288401253918497,
+ "grad_norm": 0.5165039300918579,
+ "learning_rate": 3.207253549374662e-06,
+ "loss": 0.4578,
+ "step": 579
+ },
+ {
+ "epoch": 2.7335423197492164,
+ "grad_norm": 0.5894023180007935,
+ "learning_rate": 3.200823423534519e-06,
+ "loss": 0.4448,
+ "step": 580
+ },
+ {
+ "epoch": 2.738244514106583,
+ "grad_norm": 0.5234156250953674,
+ "learning_rate": 3.194388262075293e-06,
+ "loss": 0.4504,
+ "step": 581
+ },
+ {
+ "epoch": 2.7429467084639496,
+ "grad_norm": 0.47498077154159546,
+ "learning_rate": 3.1879481112354804e-06,
+ "loss": 0.4471,
+ "step": 582
+ },
+ {
+ "epoch": 2.7476489028213167,
+ "grad_norm": 0.5213322043418884,
+ "learning_rate": 3.181503017289428e-06,
+ "loss": 0.4096,
+ "step": 583
+ },
+ {
+ "epoch": 2.7523510971786833,
+ "grad_norm": 0.5031464695930481,
+ "learning_rate": 3.175053026547002e-06,
+ "loss": 0.416,
+ "step": 584
+ },
+ {
+ "epoch": 2.7570532915360504,
+ "grad_norm": 0.7983574867248535,
+ "learning_rate": 3.16859818535325e-06,
+ "loss": 0.457,
+ "step": 585
+ },
+ {
+ "epoch": 2.761755485893417,
+ "grad_norm": 0.47774994373321533,
+ "learning_rate": 3.1621385400880756e-06,
+ "loss": 0.4529,
+ "step": 586
+ },
+ {
+ "epoch": 2.7664576802507836,
+ "grad_norm": 0.8216882348060608,
+ "learning_rate": 3.1556741371658984e-06,
+ "loss": 0.4559,
+ "step": 587
+ },
+ {
+ "epoch": 2.7711598746081503,
+ "grad_norm": 0.5124049186706543,
+ "learning_rate": 3.1492050230353238e-06,
+ "loss": 0.4438,
+ "step": 588
+ },
+ {
+ "epoch": 2.7758620689655173,
+ "grad_norm": 0.5410915017127991,
+ "learning_rate": 3.142731244178809e-06,
+ "loss": 0.4195,
+ "step": 589
+ },
+ {
+ "epoch": 2.780564263322884,
+ "grad_norm": 0.5318175554275513,
+ "learning_rate": 3.1362528471123277e-06,
+ "loss": 0.4046,
+ "step": 590
+ },
+ {
+ "epoch": 2.785266457680251,
+ "grad_norm": 0.6133676171302795,
+ "learning_rate": 3.129769878385039e-06,
+ "loss": 0.4098,
+ "step": 591
+ },
+ {
+ "epoch": 2.7899686520376177,
+ "grad_norm": 0.4698888063430786,
+ "learning_rate": 3.1232823845789473e-06,
+ "loss": 0.4508,
+ "step": 592
+ },
+ {
+ "epoch": 2.7946708463949843,
+ "grad_norm": 0.6980767250061035,
+ "learning_rate": 3.1167904123085736e-06,
+ "loss": 0.455,
+ "step": 593
+ },
+ {
+ "epoch": 2.799373040752351,
+ "grad_norm": 0.5151284337043762,
+ "learning_rate": 3.110294008220617e-06,
+ "loss": 0.4431,
+ "step": 594
+ },
+ {
+ "epoch": 2.804075235109718,
+ "grad_norm": 0.47901320457458496,
+ "learning_rate": 3.1037932189936205e-06,
+ "loss": 0.4406,
+ "step": 595
+ },
+ {
+ "epoch": 2.8087774294670846,
+ "grad_norm": 0.5079891085624695,
+ "learning_rate": 3.097288091337635e-06,
+ "loss": 0.4351,
+ "step": 596
+ },
+ {
+ "epoch": 2.8134796238244513,
+ "grad_norm": 0.5278874635696411,
+ "learning_rate": 3.0907786719938876e-06,
+ "loss": 0.4264,
+ "step": 597
+ },
+ {
+ "epoch": 2.8181818181818183,
+ "grad_norm": 0.47123396396636963,
+ "learning_rate": 3.084265007734436e-06,
+ "loss": 0.434,
+ "step": 598
+ },
+ {
+ "epoch": 2.822884012539185,
+ "grad_norm": 0.5229635834693909,
+ "learning_rate": 3.0777471453618457e-06,
+ "loss": 0.4602,
+ "step": 599
+ },
+ {
+ "epoch": 2.8275862068965516,
+ "grad_norm": 0.47847074270248413,
+ "learning_rate": 3.0712251317088426e-06,
+ "loss": 0.4317,
+ "step": 600
+ },
+ {
+ "epoch": 2.8322884012539182,
+ "grad_norm": 0.7754543423652649,
+ "learning_rate": 3.064699013637983e-06,
+ "loss": 0.4528,
+ "step": 601
+ },
+ {
+ "epoch": 2.8369905956112853,
+ "grad_norm": 0.5581084489822388,
+ "learning_rate": 3.0581688380413115e-06,
+ "loss": 0.4369,
+ "step": 602
+ },
+ {
+ "epoch": 2.841692789968652,
+ "grad_norm": 0.588622510433197,
+ "learning_rate": 3.0516346518400315e-06,
+ "loss": 0.4517,
+ "step": 603
+ },
+ {
+ "epoch": 2.846394984326019,
+ "grad_norm": 0.565423846244812,
+ "learning_rate": 3.0450965019841593e-06,
+ "loss": 0.4517,
+ "step": 604
+ },
+ {
+ "epoch": 2.8510971786833856,
+ "grad_norm": 0.47801777720451355,
+ "learning_rate": 3.0385544354521957e-06,
+ "loss": 0.4161,
+ "step": 605
+ },
+ {
+ "epoch": 2.8557993730407523,
+ "grad_norm": 0.5034862756729126,
+ "learning_rate": 3.0320084992507814e-06,
+ "loss": 0.4428,
+ "step": 606
+ },
+ {
+ "epoch": 2.860501567398119,
+ "grad_norm": 0.5339663624763489,
+ "learning_rate": 3.0254587404143604e-06,
+ "loss": 0.4792,
+ "step": 607
+ },
+ {
+ "epoch": 2.865203761755486,
+ "grad_norm": 0.48184943199157715,
+ "learning_rate": 3.0189052060048464e-06,
+ "loss": 0.4409,
+ "step": 608
+ },
+ {
+ "epoch": 2.8699059561128526,
+ "grad_norm": 0.5102176070213318,
+ "learning_rate": 3.01234794311128e-06,
+ "loss": 0.438,
+ "step": 609
+ },
+ {
+ "epoch": 2.8746081504702197,
+ "grad_norm": 0.5111781358718872,
+ "learning_rate": 3.0057869988494925e-06,
+ "loss": 0.4617,
+ "step": 610
+ },
+ {
+ "epoch": 2.8793103448275863,
+ "grad_norm": 0.5915101766586304,
+ "learning_rate": 2.999222420361767e-06,
+ "loss": 0.4532,
+ "step": 611
+ },
+ {
+ "epoch": 2.884012539184953,
+ "grad_norm": 0.48898932337760925,
+ "learning_rate": 2.9926542548165e-06,
+ "loss": 0.4663,
+ "step": 612
+ },
+ {
+ "epoch": 2.8887147335423196,
+ "grad_norm": 0.4943861961364746,
+ "learning_rate": 2.9860825494078605e-06,
+ "loss": 0.4354,
+ "step": 613
+ },
+ {
+ "epoch": 2.8934169278996866,
+ "grad_norm": 0.5398025512695312,
+ "learning_rate": 2.979507351355454e-06,
+ "loss": 0.4546,
+ "step": 614
+ },
+ {
+ "epoch": 2.8981191222570533,
+ "grad_norm": 0.545421302318573,
+ "learning_rate": 2.972928707903981e-06,
+ "loss": 0.4404,
+ "step": 615
+ },
+ {
+ "epoch": 2.9028213166144203,
+ "grad_norm": 0.5370550751686096,
+ "learning_rate": 2.966346666322898e-06,
+ "loss": 0.4401,
+ "step": 616
+ },
+ {
+ "epoch": 2.907523510971787,
+ "grad_norm": 0.5280672311782837,
+ "learning_rate": 2.9597612739060775e-06,
+ "loss": 0.4172,
+ "step": 617
+ },
+ {
+ "epoch": 2.9122257053291536,
+ "grad_norm": 0.5043423175811768,
+ "learning_rate": 2.9531725779714713e-06,
+ "loss": 0.4487,
+ "step": 618
+ },
+ {
+ "epoch": 2.91692789968652,
+ "grad_norm": 1.961200475692749,
+ "learning_rate": 2.9465806258607653e-06,
+ "loss": 0.4548,
+ "step": 619
+ },
+ {
+ "epoch": 2.9216300940438873,
+ "grad_norm": 0.5286726355552673,
+ "learning_rate": 2.939985464939043e-06,
+ "loss": 0.4566,
+ "step": 620
+ },
+ {
+ "epoch": 2.926332288401254,
+ "grad_norm": 0.5209453105926514,
+ "learning_rate": 2.9333871425944434e-06,
+ "loss": 0.4064,
+ "step": 621
+ },
+ {
+ "epoch": 2.9310344827586206,
+ "grad_norm": 0.47711747884750366,
+ "learning_rate": 2.926785706237822e-06,
+ "loss": 0.4341,
+ "step": 622
+ },
+ {
+ "epoch": 2.9357366771159876,
+ "grad_norm": 0.45926427841186523,
+ "learning_rate": 2.920181203302409e-06,
+ "loss": 0.4256,
+ "step": 623
+ },
+ {
+ "epoch": 2.9404388714733543,
+ "grad_norm": 0.5624600648880005,
+ "learning_rate": 2.91357368124347e-06,
+ "loss": 0.4252,
+ "step": 624
+ },
+ {
+ "epoch": 2.945141065830721,
+ "grad_norm": 0.5101850628852844,
+ "learning_rate": 2.906963187537962e-06,
+ "loss": 0.4352,
+ "step": 625
+ },
+ {
+ "epoch": 2.9498432601880875,
+ "grad_norm": 0.5341358184814453,
+ "learning_rate": 2.9003497696841955e-06,
+ "loss": 0.4132,
+ "step": 626
+ },
+ {
+ "epoch": 2.9545454545454546,
+ "grad_norm": 0.5917084217071533,
+ "learning_rate": 2.8937334752014913e-06,
+ "loss": 0.4693,
+ "step": 627
+ },
+ {
+ "epoch": 2.959247648902821,
+ "grad_norm": 0.793695330619812,
+ "learning_rate": 2.887114351629839e-06,
+ "loss": 0.4431,
+ "step": 628
+ },
+ {
+ "epoch": 2.9639498432601883,
+ "grad_norm": 0.5363728404045105,
+ "learning_rate": 2.8804924465295575e-06,
+ "loss": 0.4672,
+ "step": 629
+ },
+ {
+ "epoch": 2.968652037617555,
+ "grad_norm": 0.4979572892189026,
+ "learning_rate": 2.873867807480951e-06,
+ "loss": 0.4723,
+ "step": 630
+ },
+ {
+ "epoch": 2.9733542319749215,
+ "grad_norm": 0.5310130715370178,
+ "learning_rate": 2.8672404820839676e-06,
+ "loss": 0.4388,
+ "step": 631
+ },
+ {
+ "epoch": 2.978056426332288,
+ "grad_norm": 0.530015766620636,
+ "learning_rate": 2.8606105179578584e-06,
+ "loss": 0.4466,
+ "step": 632
+ },
+ {
+ "epoch": 2.9827586206896552,
+ "grad_norm": 0.5356627702713013,
+ "learning_rate": 2.8539779627408332e-06,
+ "loss": 0.4252,
+ "step": 633
+ },
+ {
+ "epoch": 2.987460815047022,
+ "grad_norm": 0.5290245413780212,
+ "learning_rate": 2.847342864089721e-06,
+ "loss": 0.4453,
+ "step": 634
+ },
+ {
+ "epoch": 2.992163009404389,
+ "grad_norm": 0.471682071685791,
+ "learning_rate": 2.8407052696796255e-06,
+ "loss": 0.43,
+ "step": 635
+ },
+ {
+ "epoch": 2.9968652037617556,
+ "grad_norm": 0.5220829844474792,
+ "learning_rate": 2.834065227203584e-06,
+ "loss": 0.4494,
+ "step": 636
+ },
+ {
+ "epoch": 3.0047021943573666,
+ "grad_norm": 0.4797399342060089,
+ "learning_rate": 2.8274227843722213e-06,
+ "loss": 0.8683,
+ "step": 637
+ },
+ {
+ "epoch": 3.0094043887147337,
+ "grad_norm": 0.5463248491287231,
+ "learning_rate": 2.820777988913412e-06,
+ "loss": 0.4157,
+ "step": 638
+ },
+ {
+ "epoch": 3.0141065830721003,
+ "grad_norm": 0.5081924200057983,
+ "learning_rate": 2.8141308885719337e-06,
+ "loss": 0.4169,
+ "step": 639
+ },
+ {
+ "epoch": 3.018808777429467,
+ "grad_norm": 0.4916677474975586,
+ "learning_rate": 2.8074815311091265e-06,
+ "loss": 0.3898,
+ "step": 640
+ },
+ {
+ "epoch": 3.023510971786834,
+ "grad_norm": 0.48858827352523804,
+ "learning_rate": 2.8008299643025477e-06,
+ "loss": 0.4319,
+ "step": 641
+ },
+ {
+ "epoch": 3.0282131661442007,
+ "grad_norm": 0.49183058738708496,
+ "learning_rate": 2.7941762359456294e-06,
+ "loss": 0.4243,
+ "step": 642
+ },
+ {
+ "epoch": 3.0329153605015673,
+ "grad_norm": 0.5068245530128479,
+ "learning_rate": 2.787520393847334e-06,
+ "loss": 0.4168,
+ "step": 643
+ },
+ {
+ "epoch": 3.0376175548589344,
+ "grad_norm": 0.542245090007782,
+ "learning_rate": 2.780862485831814e-06,
+ "loss": 0.4289,
+ "step": 644
+ },
+ {
+ "epoch": 3.042319749216301,
+ "grad_norm": 0.49114999175071716,
+ "learning_rate": 2.7742025597380644e-06,
+ "loss": 0.4337,
+ "step": 645
+ },
+ {
+ "epoch": 3.0470219435736676,
+ "grad_norm": 0.4982999563217163,
+ "learning_rate": 2.7675406634195824e-06,
+ "loss": 0.4207,
+ "step": 646
+ },
+ {
+ "epoch": 3.0517241379310347,
+ "grad_norm": 0.5352709293365479,
+ "learning_rate": 2.7608768447440193e-06,
+ "loss": 0.4087,
+ "step": 647
+ },
+ {
+ "epoch": 3.0564263322884013,
+ "grad_norm": 0.5486279726028442,
+ "learning_rate": 2.754211151592841e-06,
+ "loss": 0.4129,
+ "step": 648
+ },
+ {
+ "epoch": 3.061128526645768,
+ "grad_norm": 0.6048034429550171,
+ "learning_rate": 2.7475436318609827e-06,
+ "loss": 0.433,
+ "step": 649
+ },
+ {
+ "epoch": 3.0658307210031346,
+ "grad_norm": 0.6576470136642456,
+ "learning_rate": 2.7408743334565006e-06,
+ "loss": 0.4086,
+ "step": 650
+ },
+ {
+ "epoch": 3.0705329153605017,
+ "grad_norm": 0.49989938735961914,
+ "learning_rate": 2.734203304300235e-06,
+ "loss": 0.3999,
+ "step": 651
+ },
+ {
+ "epoch": 3.0752351097178683,
+ "grad_norm": 0.5238141417503357,
+ "learning_rate": 2.7275305923254607e-06,
+ "loss": 0.4133,
+ "step": 652
+ },
+ {
+ "epoch": 3.079937304075235,
+ "grad_norm": 0.5244804620742798,
+ "learning_rate": 2.720856245477544e-06,
+ "loss": 0.4016,
+ "step": 653
+ },
+ {
+ "epoch": 3.084639498432602,
+ "grad_norm": 0.5036159753799438,
+ "learning_rate": 2.7141803117135978e-06,
+ "loss": 0.3972,
+ "step": 654
+ },
+ {
+ "epoch": 3.0893416927899686,
+ "grad_norm": 0.5390443801879883,
+ "learning_rate": 2.7075028390021385e-06,
+ "loss": 0.3992,
+ "step": 655
+ },
+ {
+ "epoch": 3.0940438871473352,
+ "grad_norm": 0.5226757526397705,
+ "learning_rate": 2.7008238753227385e-06,
+ "loss": 0.4074,
+ "step": 656
+ },
+ {
+ "epoch": 3.0987460815047023,
+ "grad_norm": 0.48386913537979126,
+ "learning_rate": 2.694143468665685e-06,
+ "loss": 0.4284,
+ "step": 657
+ },
+ {
+ "epoch": 3.103448275862069,
+ "grad_norm": 0.5081993341445923,
+ "learning_rate": 2.6874616670316338e-06,
+ "loss": 0.3952,
+ "step": 658
+ },
+ {
+ "epoch": 3.1081504702194356,
+ "grad_norm": 0.538280189037323,
+ "learning_rate": 2.6807785184312618e-06,
+ "loss": 0.4136,
+ "step": 659
+ },
+ {
+ "epoch": 3.1128526645768027,
+ "grad_norm": 0.7804566621780396,
+ "learning_rate": 2.674094070884926e-06,
+ "loss": 0.4131,
+ "step": 660
+ },
+ {
+ "epoch": 3.1175548589341693,
+ "grad_norm": 0.6693199872970581,
+ "learning_rate": 2.6674083724223166e-06,
+ "loss": 0.4329,
+ "step": 661
+ },
+ {
+ "epoch": 3.122257053291536,
+ "grad_norm": 0.5034769773483276,
+ "learning_rate": 2.6607214710821112e-06,
+ "loss": 0.4062,
+ "step": 662
+ },
+ {
+ "epoch": 3.126959247648903,
+ "grad_norm": 0.5518231391906738,
+ "learning_rate": 2.6540334149116304e-06,
+ "loss": 0.4172,
+ "step": 663
+ },
+ {
+ "epoch": 3.1316614420062696,
+ "grad_norm": 0.5797336101531982,
+ "learning_rate": 2.647344251966493e-06,
+ "loss": 0.4164,
+ "step": 664
+ },
+ {
+ "epoch": 3.1363636363636362,
+ "grad_norm": 0.5404736399650574,
+ "learning_rate": 2.6406540303102714e-06,
+ "loss": 0.4157,
+ "step": 665
+ },
+ {
+ "epoch": 3.1410658307210033,
+ "grad_norm": 0.5246729850769043,
+ "learning_rate": 2.6339627980141425e-06,
+ "loss": 0.4165,
+ "step": 666
+ },
+ {
+ "epoch": 3.14576802507837,
+ "grad_norm": 0.5443553328514099,
+ "learning_rate": 2.6272706031565482e-06,
+ "loss": 0.4022,
+ "step": 667
+ },
+ {
+ "epoch": 3.1504702194357366,
+ "grad_norm": 0.5127459168434143,
+ "learning_rate": 2.6205774938228433e-06,
+ "loss": 0.3983,
+ "step": 668
+ },
+ {
+ "epoch": 3.1551724137931036,
+ "grad_norm": 0.5095480680465698,
+ "learning_rate": 2.6138835181049556e-06,
+ "loss": 0.4227,
+ "step": 669
+ },
+ {
+ "epoch": 3.1598746081504703,
+ "grad_norm": 0.5238015651702881,
+ "learning_rate": 2.6071887241010374e-06,
+ "loss": 0.4056,
+ "step": 670
+ },
+ {
+ "epoch": 3.164576802507837,
+ "grad_norm": 0.5659390687942505,
+ "learning_rate": 2.6004931599151223e-06,
+ "loss": 0.3933,
+ "step": 671
+ },
+ {
+ "epoch": 3.169278996865204,
+ "grad_norm": 0.528191328048706,
+ "learning_rate": 2.593796873656775e-06,
+ "loss": 0.4356,
+ "step": 672
+ },
+ {
+ "epoch": 3.1739811912225706,
+ "grad_norm": 1.1774086952209473,
+ "learning_rate": 2.587099913440749e-06,
+ "loss": 0.4149,
+ "step": 673
+ },
+ {
+ "epoch": 3.1786833855799372,
+ "grad_norm": 0.5629571676254272,
+ "learning_rate": 2.580402327386643e-06,
+ "loss": 0.403,
+ "step": 674
+ },
+ {
+ "epoch": 3.183385579937304,
+ "grad_norm": 1.1260513067245483,
+ "learning_rate": 2.5737041636185496e-06,
+ "loss": 0.4102,
+ "step": 675
+ },
+ {
+ "epoch": 3.188087774294671,
+ "grad_norm": 0.6467511653900146,
+ "learning_rate": 2.5670054702647146e-06,
+ "loss": 0.3948,
+ "step": 676
+ },
+ {
+ "epoch": 3.1927899686520376,
+ "grad_norm": 0.5177720785140991,
+ "learning_rate": 2.5603062954571872e-06,
+ "loss": 0.4188,
+ "step": 677
+ },
+ {
+ "epoch": 3.197492163009404,
+ "grad_norm": 0.5086417198181152,
+ "learning_rate": 2.553606687331477e-06,
+ "loss": 0.4403,
+ "step": 678
+ },
+ {
+ "epoch": 3.2021943573667713,
+ "grad_norm": 0.5762012600898743,
+ "learning_rate": 2.5469066940262073e-06,
+ "loss": 0.4084,
+ "step": 679
+ },
+ {
+ "epoch": 3.206896551724138,
+ "grad_norm": 0.5122736692428589,
+ "learning_rate": 2.540206363682768e-06,
+ "loss": 0.4005,
+ "step": 680
+ },
+ {
+ "epoch": 3.2115987460815045,
+ "grad_norm": 0.5179394483566284,
+ "learning_rate": 2.533505744444972e-06,
+ "loss": 0.419,
+ "step": 681
+ },
+ {
+ "epoch": 3.2163009404388716,
+ "grad_norm": 0.5541443824768066,
+ "learning_rate": 2.526804884458707e-06,
+ "loss": 0.4112,
+ "step": 682
+ },
+ {
+ "epoch": 3.2210031347962382,
+ "grad_norm": 0.5687317252159119,
+ "learning_rate": 2.520103831871591e-06,
+ "loss": 0.4145,
+ "step": 683
+ },
+ {
+ "epoch": 3.225705329153605,
+ "grad_norm": 0.5060294270515442,
+ "learning_rate": 2.513402634832627e-06,
+ "loss": 0.3933,
+ "step": 684
+ },
+ {
+ "epoch": 3.230407523510972,
+ "grad_norm": 0.6311008930206299,
+ "learning_rate": 2.5067013414918523e-06,
+ "loss": 0.401,
+ "step": 685
+ },
+ {
+ "epoch": 3.2351097178683386,
+ "grad_norm": 0.5575832724571228,
+ "learning_rate": 2.5e-06,
+ "loss": 0.4127,
+ "step": 686
+ },
+ {
+ "epoch": 3.239811912225705,
+ "grad_norm": 0.5105507373809814,
+ "learning_rate": 2.493298658508149e-06,
+ "loss": 0.3971,
+ "step": 687
+ },
+ {
+ "epoch": 3.2445141065830723,
+ "grad_norm": 0.5813129544258118,
+ "learning_rate": 2.4865973651673743e-06,
+ "loss": 0.4136,
+ "step": 688
+ },
+ {
+ "epoch": 3.249216300940439,
+ "grad_norm": 0.5921242833137512,
+ "learning_rate": 2.4798961681284096e-06,
+ "loss": 0.437,
+ "step": 689
+ },
+ {
+ "epoch": 3.2539184952978055,
+ "grad_norm": 0.5654864311218262,
+ "learning_rate": 2.473195115541293e-06,
+ "loss": 0.3939,
+ "step": 690
+ },
+ {
+ "epoch": 3.2586206896551726,
+ "grad_norm": 0.5103882551193237,
+ "learning_rate": 2.466494255555029e-06,
+ "loss": 0.4394,
+ "step": 691
+ },
+ {
+ "epoch": 3.2633228840125392,
+ "grad_norm": 0.5423967242240906,
+ "learning_rate": 2.459793636317233e-06,
+ "loss": 0.4048,
+ "step": 692
+ },
+ {
+ "epoch": 3.268025078369906,
+ "grad_norm": 0.6185951828956604,
+ "learning_rate": 2.4530933059737936e-06,
+ "loss": 0.4432,
+ "step": 693
+ },
+ {
+ "epoch": 3.2727272727272725,
+ "grad_norm": 0.6062753796577454,
+ "learning_rate": 2.4463933126685236e-06,
+ "loss": 0.4061,
+ "step": 694
+ },
+ {
+ "epoch": 3.2774294670846396,
+ "grad_norm": 0.5118281841278076,
+ "learning_rate": 2.439693704542814e-06,
+ "loss": 0.4008,
+ "step": 695
+ },
+ {
+ "epoch": 3.282131661442006,
+ "grad_norm": 0.9080231785774231,
+ "learning_rate": 2.432994529735286e-06,
+ "loss": 0.409,
+ "step": 696
+ },
+ {
+ "epoch": 3.2868338557993733,
+ "grad_norm": 0.550635814666748,
+ "learning_rate": 2.4262958363814512e-06,
+ "loss": 0.4202,
+ "step": 697
+ },
+ {
+ "epoch": 3.29153605015674,
+ "grad_norm": 0.5728116631507874,
+ "learning_rate": 2.4195976726133574e-06,
+ "loss": 0.406,
+ "step": 698
+ },
+ {
+ "epoch": 3.2962382445141065,
+ "grad_norm": 0.4995472729206085,
+ "learning_rate": 2.4129000865592517e-06,
+ "loss": 0.4063,
+ "step": 699
+ },
+ {
+ "epoch": 3.300940438871473,
+ "grad_norm": 0.601259708404541,
+ "learning_rate": 2.4062031263432267e-06,
+ "loss": 0.4268,
+ "step": 700
+ },
+ {
+ "epoch": 3.30564263322884,
+ "grad_norm": 0.570606529712677,
+ "learning_rate": 2.3995068400848785e-06,
+ "loss": 0.4034,
+ "step": 701
+ },
+ {
+ "epoch": 3.310344827586207,
+ "grad_norm": 0.5638160705566406,
+ "learning_rate": 2.392811275898963e-06,
+ "loss": 0.4212,
+ "step": 702
+ },
+ {
+ "epoch": 3.3150470219435735,
+ "grad_norm": 0.5354572534561157,
+ "learning_rate": 2.3861164818950448e-06,
+ "loss": 0.3893,
+ "step": 703
+ },
+ {
+ "epoch": 3.3197492163009406,
+ "grad_norm": 0.5149163603782654,
+ "learning_rate": 2.379422506177157e-06,
+ "loss": 0.4126,
+ "step": 704
+ },
+ {
+ "epoch": 3.324451410658307,
+ "grad_norm": 0.5132194757461548,
+ "learning_rate": 2.372729396843453e-06,
+ "loss": 0.4132,
+ "step": 705
+ },
+ {
+ "epoch": 3.329153605015674,
+ "grad_norm": 0.5163543224334717,
+ "learning_rate": 2.366037201985858e-06,
+ "loss": 0.418,
+ "step": 706
+ },
+ {
+ "epoch": 3.333855799373041,
+ "grad_norm": 0.5132508277893066,
+ "learning_rate": 2.3593459696897294e-06,
+ "loss": 0.3944,
+ "step": 707
+ },
+ {
+ "epoch": 3.3385579937304075,
+ "grad_norm": 0.5490009188652039,
+ "learning_rate": 2.352655748033508e-06,
+ "loss": 0.414,
+ "step": 708
+ },
+ {
+ "epoch": 3.343260188087774,
+ "grad_norm": 0.5879104733467102,
+ "learning_rate": 2.3459665850883704e-06,
+ "loss": 0.4344,
+ "step": 709
+ },
+ {
+ "epoch": 3.347962382445141,
+ "grad_norm": 0.5451306700706482,
+ "learning_rate": 2.33927852891789e-06,
+ "loss": 0.4208,
+ "step": 710
+ },
+ {
+ "epoch": 3.352664576802508,
+ "grad_norm": 0.5207070708274841,
+ "learning_rate": 2.3325916275776834e-06,
+ "loss": 0.4398,
+ "step": 711
+ },
+ {
+ "epoch": 3.3573667711598745,
+ "grad_norm": 0.5440477132797241,
+ "learning_rate": 2.3259059291150744e-06,
+ "loss": 0.4015,
+ "step": 712
+ },
+ {
+ "epoch": 3.3620689655172415,
+ "grad_norm": 0.5619958639144897,
+ "learning_rate": 2.319221481568739e-06,
+ "loss": 0.4196,
+ "step": 713
+ },
+ {
+ "epoch": 3.366771159874608,
+ "grad_norm": 0.6007470488548279,
+ "learning_rate": 2.3125383329683666e-06,
+ "loss": 0.4217,
+ "step": 714
+ },
+ {
+ "epoch": 3.371473354231975,
+ "grad_norm": 0.4972032904624939,
+ "learning_rate": 2.3058565313343152e-06,
+ "loss": 0.3904,
+ "step": 715
+ },
+ {
+ "epoch": 3.376175548589342,
+ "grad_norm": 0.5420966148376465,
+ "learning_rate": 2.2991761246772623e-06,
+ "loss": 0.4048,
+ "step": 716
+ },
+ {
+ "epoch": 3.3808777429467085,
+ "grad_norm": 0.520063042640686,
+ "learning_rate": 2.2924971609978623e-06,
+ "loss": 0.3965,
+ "step": 717
+ },
+ {
+ "epoch": 3.385579937304075,
+ "grad_norm": 0.8903913497924805,
+ "learning_rate": 2.285819688286403e-06,
+ "loss": 0.3873,
+ "step": 718
+ },
+ {
+ "epoch": 3.3902821316614418,
+ "grad_norm": 0.5380633473396301,
+ "learning_rate": 2.2791437545224563e-06,
+ "loss": 0.4335,
+ "step": 719
+ },
+ {
+ "epoch": 3.394984326018809,
+ "grad_norm": 0.5058356523513794,
+ "learning_rate": 2.2724694076745397e-06,
+ "loss": 0.4134,
+ "step": 720
+ },
+ {
+ "epoch": 3.3996865203761755,
+ "grad_norm": 0.5383400321006775,
+ "learning_rate": 2.265796695699766e-06,
+ "loss": 0.4154,
+ "step": 721
+ },
+ {
+ "epoch": 3.4043887147335425,
+ "grad_norm": 0.5831345319747925,
+ "learning_rate": 2.2591256665434998e-06,
+ "loss": 0.4193,
+ "step": 722
+ },
+ {
+ "epoch": 3.409090909090909,
+ "grad_norm": 0.5494023561477661,
+ "learning_rate": 2.252456368139019e-06,
+ "loss": 0.4137,
+ "step": 723
+ },
+ {
+ "epoch": 3.413793103448276,
+ "grad_norm": 0.5735755562782288,
+ "learning_rate": 2.245788848407159e-06,
+ "loss": 0.4211,
+ "step": 724
+ },
+ {
+ "epoch": 3.4184952978056424,
+ "grad_norm": 0.5244953036308289,
+ "learning_rate": 2.2391231552559815e-06,
+ "loss": 0.4194,
+ "step": 725
+ },
+ {
+ "epoch": 3.4231974921630095,
+ "grad_norm": 0.5803194642066956,
+ "learning_rate": 2.2324593365804184e-06,
+ "loss": 0.3882,
+ "step": 726
+ },
+ {
+ "epoch": 3.427899686520376,
+ "grad_norm": 0.5303656458854675,
+ "learning_rate": 2.225797440261936e-06,
+ "loss": 0.4336,
+ "step": 727
+ },
+ {
+ "epoch": 3.4326018808777428,
+ "grad_norm": 0.6270896792411804,
+ "learning_rate": 2.219137514168187e-06,
+ "loss": 0.397,
+ "step": 728
+ },
+ {
+ "epoch": 3.43730407523511,
+ "grad_norm": 0.5054409503936768,
+ "learning_rate": 2.212479606152667e-06,
+ "loss": 0.4261,
+ "step": 729
+ },
+ {
+ "epoch": 3.4420062695924765,
+ "grad_norm": 0.5422618985176086,
+ "learning_rate": 2.205823764054372e-06,
+ "loss": 0.4105,
+ "step": 730
+ },
+ {
+ "epoch": 3.446708463949843,
+ "grad_norm": 0.5200968980789185,
+ "learning_rate": 2.199170035697453e-06,
+ "loss": 0.4048,
+ "step": 731
+ },
+ {
+ "epoch": 3.45141065830721,
+ "grad_norm": 0.5316998362541199,
+ "learning_rate": 2.1925184688908735e-06,
+ "loss": 0.4132,
+ "step": 732
+ },
+ {
+ "epoch": 3.456112852664577,
+ "grad_norm": 0.5780388116836548,
+ "learning_rate": 2.185869111428067e-06,
+ "loss": 0.4381,
+ "step": 733
+ },
+ {
+ "epoch": 3.4608150470219434,
+ "grad_norm": 0.5547174215316772,
+ "learning_rate": 2.1792220110865885e-06,
+ "loss": 0.4236,
+ "step": 734
+ },
+ {
+ "epoch": 3.4655172413793105,
+ "grad_norm": 0.5188453197479248,
+ "learning_rate": 2.1725772156277795e-06,
+ "loss": 0.4052,
+ "step": 735
+ },
+ {
+ "epoch": 3.470219435736677,
+ "grad_norm": 0.5145602822303772,
+ "learning_rate": 2.165934772796417e-06,
+ "loss": 0.412,
+ "step": 736
+ },
+ {
+ "epoch": 3.4749216300940438,
+ "grad_norm": 0.5960094332695007,
+ "learning_rate": 2.159294730320374e-06,
+ "loss": 0.426,
+ "step": 737
+ },
+ {
+ "epoch": 3.479623824451411,
+ "grad_norm": 0.7090360522270203,
+ "learning_rate": 2.15265713591028e-06,
+ "loss": 0.4133,
+ "step": 738
+ },
+ {
+ "epoch": 3.4843260188087775,
+ "grad_norm": 0.5428952574729919,
+ "learning_rate": 2.1460220372591676e-06,
+ "loss": 0.4332,
+ "step": 739
+ },
+ {
+ "epoch": 3.489028213166144,
+ "grad_norm": 0.6610196232795715,
+ "learning_rate": 2.139389482042142e-06,
+ "loss": 0.3985,
+ "step": 740
+ },
+ {
+ "epoch": 3.493730407523511,
+ "grad_norm": 0.5409770607948303,
+ "learning_rate": 2.1327595179160332e-06,
+ "loss": 0.4148,
+ "step": 741
+ },
+ {
+ "epoch": 3.498432601880878,
+ "grad_norm": 0.8822159171104431,
+ "learning_rate": 2.1261321925190492e-06,
+ "loss": 0.4071,
+ "step": 742
+ },
+ {
+ "epoch": 3.5031347962382444,
+ "grad_norm": 0.5366957783699036,
+ "learning_rate": 2.1195075534704433e-06,
+ "loss": 0.3838,
+ "step": 743
+ },
+ {
+ "epoch": 3.507836990595611,
+ "grad_norm": 0.5289701819419861,
+ "learning_rate": 2.1128856483701625e-06,
+ "loss": 0.4123,
+ "step": 744
+ },
+ {
+ "epoch": 3.512539184952978,
+ "grad_norm": 0.5737835764884949,
+ "learning_rate": 2.10626652479851e-06,
+ "loss": 0.392,
+ "step": 745
+ },
+ {
+ "epoch": 3.5172413793103448,
+ "grad_norm": 0.5381962060928345,
+ "learning_rate": 2.0996502303158057e-06,
+ "loss": 0.4088,
+ "step": 746
+ },
+ {
+ "epoch": 3.521943573667712,
+ "grad_norm": 0.529466450214386,
+ "learning_rate": 2.0930368124620385e-06,
+ "loss": 0.4098,
+ "step": 747
+ },
+ {
+ "epoch": 3.5266457680250785,
+ "grad_norm": 0.6686971783638,
+ "learning_rate": 2.086426318756531e-06,
+ "loss": 0.4273,
+ "step": 748
+ },
+ {
+ "epoch": 3.531347962382445,
+ "grad_norm": 0.5246966481208801,
+ "learning_rate": 2.0798187966975917e-06,
+ "loss": 0.4318,
+ "step": 749
+ },
+ {
+ "epoch": 3.5360501567398117,
+ "grad_norm": 0.5165736675262451,
+ "learning_rate": 2.073214293762179e-06,
+ "loss": 0.4212,
+ "step": 750
+ },
+ {
+ "epoch": 3.540752351097179,
+ "grad_norm": 0.6821503043174744,
+ "learning_rate": 2.0666128574055575e-06,
+ "loss": 0.4199,
+ "step": 751
+ },
+ {
+ "epoch": 3.5454545454545454,
+ "grad_norm": 0.5294732451438904,
+ "learning_rate": 2.0600145350609585e-06,
+ "loss": 0.4192,
+ "step": 752
+ },
+ {
+ "epoch": 3.5501567398119125,
+ "grad_norm": 0.515800416469574,
+ "learning_rate": 2.053419374139235e-06,
+ "loss": 0.4172,
+ "step": 753
+ },
+ {
+ "epoch": 3.554858934169279,
+ "grad_norm": 0.5241639614105225,
+ "learning_rate": 2.0468274220285295e-06,
+ "loss": 0.4138,
+ "step": 754
+ },
+ {
+ "epoch": 3.5595611285266457,
+ "grad_norm": 0.546105146408081,
+ "learning_rate": 2.0402387260939224e-06,
+ "loss": 0.4123,
+ "step": 755
+ },
+ {
+ "epoch": 3.5642633228840124,
+ "grad_norm": 0.5261510014533997,
+ "learning_rate": 2.033653333677103e-06,
+ "loss": 0.4225,
+ "step": 756
+ },
+ {
+ "epoch": 3.5689655172413794,
+ "grad_norm": 0.5825217366218567,
+ "learning_rate": 2.02707129209602e-06,
+ "loss": 0.4042,
+ "step": 757
+ },
+ {
+ "epoch": 3.573667711598746,
+ "grad_norm": 0.5916388034820557,
+ "learning_rate": 2.0204926486445463e-06,
+ "loss": 0.4222,
+ "step": 758
+ },
+ {
+ "epoch": 3.5783699059561127,
+ "grad_norm": 0.5643376708030701,
+ "learning_rate": 2.0139174505921403e-06,
+ "loss": 0.4419,
+ "step": 759
+ },
+ {
+ "epoch": 3.58307210031348,
+ "grad_norm": 0.5426534414291382,
+ "learning_rate": 2.0073457451835e-06,
+ "loss": 0.3985,
+ "step": 760
+ },
+ {
+ "epoch": 3.5877742946708464,
+ "grad_norm": 0.48811203241348267,
+ "learning_rate": 2.0007775796382335e-06,
+ "loss": 0.4249,
+ "step": 761
+ },
+ {
+ "epoch": 3.592476489028213,
+ "grad_norm": 0.5216817855834961,
+ "learning_rate": 1.994213001150508e-06,
+ "loss": 0.3931,
+ "step": 762
+ },
+ {
+ "epoch": 3.5971786833855797,
+ "grad_norm": 0.5739433169364929,
+ "learning_rate": 1.9876520568887207e-06,
+ "loss": 0.42,
+ "step": 763
+ },
+ {
+ "epoch": 3.6018808777429467,
+ "grad_norm": 0.5166419148445129,
+ "learning_rate": 1.981094793995155e-06,
+ "loss": 0.4041,
+ "step": 764
+ },
+ {
+ "epoch": 3.6065830721003134,
+ "grad_norm": 0.6763928532600403,
+ "learning_rate": 1.974541259585641e-06,
+ "loss": 0.4319,
+ "step": 765
+ },
+ {
+ "epoch": 3.6112852664576804,
+ "grad_norm": 0.5443664789199829,
+ "learning_rate": 1.9679915007492194e-06,
+ "loss": 0.4139,
+ "step": 766
+ },
+ {
+ "epoch": 3.615987460815047,
+ "grad_norm": 0.6719280481338501,
+ "learning_rate": 1.9614455645478047e-06,
+ "loss": 0.4015,
+ "step": 767
+ },
+ {
+ "epoch": 3.6206896551724137,
+ "grad_norm": 0.5685383677482605,
+ "learning_rate": 1.9549034980158403e-06,
+ "loss": 0.4153,
+ "step": 768
+ },
+ {
+ "epoch": 3.6253918495297803,
+ "grad_norm": 0.5463993549346924,
+ "learning_rate": 1.9483653481599697e-06,
+ "loss": 0.4193,
+ "step": 769
+ },
+ {
+ "epoch": 3.6300940438871474,
+ "grad_norm": 0.5228095054626465,
+ "learning_rate": 1.9418311619586897e-06,
+ "loss": 0.4268,
+ "step": 770
+ },
+ {
+ "epoch": 3.634796238244514,
+ "grad_norm": 0.6472461223602295,
+ "learning_rate": 1.935300986362018e-06,
+ "loss": 0.3981,
+ "step": 771
+ },
+ {
+ "epoch": 3.639498432601881,
+ "grad_norm": 0.61808842420578,
+ "learning_rate": 1.9287748682911582e-06,
+ "loss": 0.4313,
+ "step": 772
+ },
+ {
+ "epoch": 3.6442006269592477,
+ "grad_norm": 0.5122710466384888,
+ "learning_rate": 1.9222528546381543e-06,
+ "loss": 0.4219,
+ "step": 773
+ },
+ {
+ "epoch": 3.6489028213166144,
+ "grad_norm": 0.5540320873260498,
+ "learning_rate": 1.9157349922655648e-06,
+ "loss": 0.4001,
+ "step": 774
+ },
+ {
+ "epoch": 3.653605015673981,
+ "grad_norm": 0.5066401958465576,
+ "learning_rate": 1.909221328006114e-06,
+ "loss": 0.4089,
+ "step": 775
+ },
+ {
+ "epoch": 3.658307210031348,
+ "grad_norm": 0.5802583694458008,
+ "learning_rate": 1.9027119086623647e-06,
+ "loss": 0.4216,
+ "step": 776
+ },
+ {
+ "epoch": 3.6630094043887147,
+ "grad_norm": 0.5735054016113281,
+ "learning_rate": 1.8962067810063806e-06,
+ "loss": 0.4372,
+ "step": 777
+ },
+ {
+ "epoch": 3.6677115987460818,
+ "grad_norm": 0.5177802443504333,
+ "learning_rate": 1.8897059917793844e-06,
+ "loss": 0.3912,
+ "step": 778
+ },
+ {
+ "epoch": 3.6724137931034484,
+ "grad_norm": 0.5790892243385315,
+ "learning_rate": 1.8832095876914268e-06,
+ "loss": 0.4096,
+ "step": 779
+ },
+ {
+ "epoch": 3.677115987460815,
+ "grad_norm": 0.5386017560958862,
+ "learning_rate": 1.8767176154210537e-06,
+ "loss": 0.4191,
+ "step": 780
+ },
+ {
+ "epoch": 3.6818181818181817,
+ "grad_norm": 0.5927474498748779,
+ "learning_rate": 1.8702301216149616e-06,
+ "loss": 0.4061,
+ "step": 781
+ },
+ {
+ "epoch": 3.6865203761755487,
+ "grad_norm": 0.5609317421913147,
+ "learning_rate": 1.8637471528876727e-06,
+ "loss": 0.4067,
+ "step": 782
+ },
+ {
+ "epoch": 3.6912225705329154,
+ "grad_norm": 0.6609043478965759,
+ "learning_rate": 1.8572687558211923e-06,
+ "loss": 0.4183,
+ "step": 783
+ },
+ {
+ "epoch": 3.695924764890282,
+ "grad_norm": 0.5092527270317078,
+ "learning_rate": 1.850794976964677e-06,
+ "loss": 0.3827,
+ "step": 784
+ },
+ {
+ "epoch": 3.700626959247649,
+ "grad_norm": 0.8918034434318542,
+ "learning_rate": 1.8443258628341026e-06,
+ "loss": 0.4144,
+ "step": 785
+ },
+ {
+ "epoch": 3.7053291536050157,
+ "grad_norm": 0.5443233847618103,
+ "learning_rate": 1.837861459911925e-06,
+ "loss": 0.4246,
+ "step": 786
+ },
+ {
+ "epoch": 3.7100313479623823,
+ "grad_norm": 0.6559080481529236,
+ "learning_rate": 1.8314018146467505e-06,
+ "loss": 0.4067,
+ "step": 787
+ },
+ {
+ "epoch": 3.714733542319749,
+ "grad_norm": 0.5071741342544556,
+ "learning_rate": 1.8249469734529995e-06,
+ "loss": 0.3888,
+ "step": 788
+ },
+ {
+ "epoch": 3.719435736677116,
+ "grad_norm": 0.5663676261901855,
+ "learning_rate": 1.818496982710572e-06,
+ "loss": 0.4256,
+ "step": 789
+ },
+ {
+ "epoch": 3.7241379310344827,
+ "grad_norm": 0.5477777719497681,
+ "learning_rate": 1.81205188876452e-06,
+ "loss": 0.423,
+ "step": 790
+ },
+ {
+ "epoch": 3.7288401253918497,
+ "grad_norm": 0.5709276795387268,
+ "learning_rate": 1.8056117379247078e-06,
+ "loss": 0.4265,
+ "step": 791
+ },
+ {
+ "epoch": 3.7335423197492164,
+ "grad_norm": 0.49602681398391724,
+ "learning_rate": 1.7991765764654813e-06,
+ "loss": 0.4141,
+ "step": 792
+ },
+ {
+ "epoch": 3.738244514106583,
+ "grad_norm": 0.5358700156211853,
+ "learning_rate": 1.7927464506253394e-06,
+ "loss": 0.4231,
+ "step": 793
+ },
+ {
+ "epoch": 3.7429467084639496,
+ "grad_norm": 1.1592613458633423,
+ "learning_rate": 1.7863214066065951e-06,
+ "loss": 0.3929,
+ "step": 794
+ },
+ {
+ "epoch": 3.7476489028213167,
+ "grad_norm": 0.5176786780357361,
+ "learning_rate": 1.779901490575051e-06,
+ "loss": 0.4201,
+ "step": 795
+ },
+ {
+ "epoch": 3.7523510971786833,
+ "grad_norm": 0.5303675532341003,
+ "learning_rate": 1.7734867486596596e-06,
+ "loss": 0.4201,
+ "step": 796
+ },
+ {
+ "epoch": 3.7570532915360504,
+ "grad_norm": 0.5633402466773987,
+ "learning_rate": 1.767077226952198e-06,
+ "loss": 0.4276,
+ "step": 797
+ },
+ {
+ "epoch": 3.761755485893417,
+ "grad_norm": 0.6016635894775391,
+ "learning_rate": 1.7606729715069349e-06,
+ "loss": 0.4143,
+ "step": 798
+ },
+ {
+ "epoch": 3.7664576802507836,
+ "grad_norm": 0.5202106237411499,
+ "learning_rate": 1.7542740283402981e-06,
+ "loss": 0.4195,
+ "step": 799
+ },
+ {
+ "epoch": 3.7711598746081503,
+ "grad_norm": 0.6279420852661133,
+ "learning_rate": 1.7478804434305466e-06,
+ "loss": 0.4001,
+ "step": 800
+ },
+ {
+ "epoch": 3.7758620689655173,
+ "grad_norm": 0.5253601670265198,
+ "learning_rate": 1.741492262717438e-06,
+ "loss": 0.4206,
+ "step": 801
+ },
+ {
+ "epoch": 3.780564263322884,
+ "grad_norm": 0.5218167901039124,
+ "learning_rate": 1.7351095321018974e-06,
+ "loss": 0.387,
+ "step": 802
+ },
+ {
+ "epoch": 3.785266457680251,
+ "grad_norm": 0.530846357345581,
+ "learning_rate": 1.7287322974456933e-06,
+ "loss": 0.3935,
+ "step": 803
+ },
+ {
+ "epoch": 3.7899686520376177,
+ "grad_norm": 0.5487862825393677,
+ "learning_rate": 1.7223606045711006e-06,
+ "loss": 0.4168,
+ "step": 804
+ },
+ {
+ "epoch": 3.7946708463949843,
+ "grad_norm": 0.5345083475112915,
+ "learning_rate": 1.7159944992605774e-06,
+ "loss": 0.4208,
+ "step": 805
+ },
+ {
+ "epoch": 3.799373040752351,
+ "grad_norm": 0.5425072312355042,
+ "learning_rate": 1.7096340272564318e-06,
+ "loss": 0.4088,
+ "step": 806
+ },
+ {
+ "epoch": 3.804075235109718,
+ "grad_norm": 0.5253011584281921,
+ "learning_rate": 1.7032792342604947e-06,
+ "loss": 0.3995,
+ "step": 807
+ },
+ {
+ "epoch": 3.8087774294670846,
+ "grad_norm": 0.7746017575263977,
+ "learning_rate": 1.6969301659337944e-06,
+ "loss": 0.4145,
+ "step": 808
+ },
+ {
+ "epoch": 3.8134796238244513,
+ "grad_norm": 0.7049569487571716,
+ "learning_rate": 1.6905868678962225e-06,
+ "loss": 0.4216,
+ "step": 809
+ },
+ {
+ "epoch": 3.8181818181818183,
+ "grad_norm": 0.602180540561676,
+ "learning_rate": 1.684249385726211e-06,
+ "loss": 0.4134,
+ "step": 810
+ },
+ {
+ "epoch": 3.822884012539185,
+ "grad_norm": 0.5291408896446228,
+ "learning_rate": 1.677917764960404e-06,
+ "loss": 0.402,
+ "step": 811
+ },
+ {
+ "epoch": 3.8275862068965516,
+ "grad_norm": 0.5529280304908752,
+ "learning_rate": 1.6715920510933277e-06,
+ "loss": 0.4322,
+ "step": 812
+ },
+ {
+ "epoch": 3.8322884012539182,
+ "grad_norm": 0.5989758968353271,
+ "learning_rate": 1.6652722895770676e-06,
+ "loss": 0.4275,
+ "step": 813
+ },
+ {
+ "epoch": 3.8369905956112853,
+ "grad_norm": 0.5088624358177185,
+ "learning_rate": 1.6589585258209383e-06,
+ "loss": 0.378,
+ "step": 814
+ },
+ {
+ "epoch": 3.841692789968652,
+ "grad_norm": 0.5167607665061951,
+ "learning_rate": 1.6526508051911588e-06,
+ "loss": 0.4221,
+ "step": 815
+ },
+ {
+ "epoch": 3.846394984326019,
+ "grad_norm": 0.5582865476608276,
+ "learning_rate": 1.6463491730105282e-06,
+ "loss": 0.4091,
+ "step": 816
+ },
+ {
+ "epoch": 3.8510971786833856,
+ "grad_norm": 0.5103083252906799,
+ "learning_rate": 1.6400536745580955e-06,
+ "loss": 0.3867,
+ "step": 817
+ },
+ {
+ "epoch": 3.8557993730407523,
+ "grad_norm": 0.528692901134491,
+ "learning_rate": 1.6337643550688408e-06,
+ "loss": 0.4178,
+ "step": 818
+ },
+ {
+ "epoch": 3.860501567398119,
+ "grad_norm": 0.5174258947372437,
+ "learning_rate": 1.627481259733343e-06,
+ "loss": 0.3989,
+ "step": 819
+ },
+ {
+ "epoch": 3.865203761755486,
+ "grad_norm": 0.492735892534256,
+ "learning_rate": 1.6212044336974598e-06,
+ "loss": 0.3935,
+ "step": 820
+ },
+ {
+ "epoch": 3.8699059561128526,
+ "grad_norm": 0.5810956954956055,
+ "learning_rate": 1.614933922062003e-06,
+ "loss": 0.4082,
+ "step": 821
+ },
+ {
+ "epoch": 3.8746081504702197,
+ "grad_norm": 0.5235511660575867,
+ "learning_rate": 1.6086697698824144e-06,
+ "loss": 0.4026,
+ "step": 822
+ },
+ {
+ "epoch": 3.8793103448275863,
+ "grad_norm": 0.5972744822502136,
+ "learning_rate": 1.6024120221684373e-06,
+ "loss": 0.4018,
+ "step": 823
+ },
+ {
+ "epoch": 3.884012539184953,
+ "grad_norm": 0.5685083270072937,
+ "learning_rate": 1.5961607238838022e-06,
+ "loss": 0.4077,
+ "step": 824
+ },
+ {
+ "epoch": 3.8887147335423196,
+ "grad_norm": 0.5427765250205994,
+ "learning_rate": 1.589915919945894e-06,
+ "loss": 0.4187,
+ "step": 825
+ },
+ {
+ "epoch": 3.8934169278996866,
+ "grad_norm": 0.6297295093536377,
+ "learning_rate": 1.5836776552254386e-06,
+ "loss": 0.4367,
+ "step": 826
+ },
+ {
+ "epoch": 3.8981191222570533,
+ "grad_norm": 0.6110124588012695,
+ "learning_rate": 1.5774459745461711e-06,
+ "loss": 0.4065,
+ "step": 827
+ },
+ {
+ "epoch": 3.9028213166144203,
+ "grad_norm": 0.4981592297554016,
+ "learning_rate": 1.5712209226845201e-06,
+ "loss": 0.3836,
+ "step": 828
+ },
+ {
+ "epoch": 3.907523510971787,
+ "grad_norm": 0.5722451210021973,
+ "learning_rate": 1.565002544369286e-06,
+ "loss": 0.4161,
+ "step": 829
+ },
+ {
+ "epoch": 3.9122257053291536,
+ "grad_norm": 0.6718733310699463,
+ "learning_rate": 1.5587908842813142e-06,
+ "loss": 0.4053,
+ "step": 830
+ },
+ {
+ "epoch": 3.91692789968652,
+ "grad_norm": 0.5070095658302307,
+ "learning_rate": 1.5525859870531823e-06,
+ "loss": 0.4198,
+ "step": 831
+ },
+ {
+ "epoch": 3.9216300940438873,
+ "grad_norm": 0.5303407311439514,
+ "learning_rate": 1.5463878972688707e-06,
+ "loss": 0.4089,
+ "step": 832
+ },
+ {
+ "epoch": 3.926332288401254,
+ "grad_norm": 0.5431908369064331,
+ "learning_rate": 1.5401966594634483e-06,
+ "loss": 0.4341,
+ "step": 833
+ },
+ {
+ "epoch": 3.9310344827586206,
+ "grad_norm": 0.549174427986145,
+ "learning_rate": 1.5340123181227495e-06,
+ "loss": 0.4237,
+ "step": 834
+ },
+ {
+ "epoch": 3.9357366771159876,
+ "grad_norm": 0.8902267217636108,
+ "learning_rate": 1.527834917683058e-06,
+ "loss": 0.3904,
+ "step": 835
+ },
+ {
+ "epoch": 3.9404388714733543,
+ "grad_norm": 0.5055849552154541,
+ "learning_rate": 1.5216645025307813e-06,
+ "loss": 0.4058,
+ "step": 836
+ },
+ {
+ "epoch": 3.945141065830721,
+ "grad_norm": 0.5319788455963135,
+ "learning_rate": 1.5155011170021399e-06,
+ "loss": 0.4153,
+ "step": 837
+ },
+ {
+ "epoch": 3.9498432601880875,
+ "grad_norm": 0.5441375374794006,
+ "learning_rate": 1.5093448053828402e-06,
+ "loss": 0.4231,
+ "step": 838
+ },
+ {
+ "epoch": 3.9545454545454546,
+ "grad_norm": 0.5940942764282227,
+ "learning_rate": 1.503195611907764e-06,
+ "loss": 0.4241,
+ "step": 839
+ },
+ {
+ "epoch": 3.959247648902821,
+ "grad_norm": 0.5203325748443604,
+ "learning_rate": 1.4970535807606453e-06,
+ "loss": 0.3842,
+ "step": 840
+ },
+ {
+ "epoch": 3.9639498432601883,
+ "grad_norm": 0.525404691696167,
+ "learning_rate": 1.4909187560737542e-06,
+ "loss": 0.3954,
+ "step": 841
+ },
+ {
+ "epoch": 3.968652037617555,
+ "grad_norm": 0.5999636054039001,
+ "learning_rate": 1.4847911819275829e-06,
+ "loss": 0.4061,
+ "step": 842
+ },
+ {
+ "epoch": 3.9733542319749215,
+ "grad_norm": 0.5253078937530518,
+ "learning_rate": 1.4786709023505224e-06,
+ "loss": 0.3969,
+ "step": 843
+ },
+ {
+ "epoch": 3.978056426332288,
+ "grad_norm": 0.535467803478241,
+ "learning_rate": 1.4725579613185549e-06,
+ "loss": 0.4241,
+ "step": 844
+ },
+ {
+ "epoch": 3.9827586206896552,
+ "grad_norm": 0.5458933711051941,
+ "learning_rate": 1.4664524027549291e-06,
+ "loss": 0.4102,
+ "step": 845
+ },
+ {
+ "epoch": 3.987460815047022,
+ "grad_norm": 0.515102207660675,
+ "learning_rate": 1.4603542705298493e-06,
+ "loss": 0.3957,
+ "step": 846
+ },
+ {
+ "epoch": 3.992163009404389,
+ "grad_norm": 0.572600245475769,
+ "learning_rate": 1.4542636084601624e-06,
+ "loss": 0.3686,
+ "step": 847
+ },
+ {
+ "epoch": 3.9968652037617556,
+ "grad_norm": 0.520165205001831,
+ "learning_rate": 1.4481804603090358e-06,
+ "loss": 0.4109,
+ "step": 848
+ },
+ {
+ "epoch": 4.004702194357367,
+ "grad_norm": 0.9280151128768921,
+ "learning_rate": 1.4421048697856494e-06,
+ "loss": 0.7531,
+ "step": 849
+ },
+ {
+ "epoch": 4.009404388714733,
+ "grad_norm": 0.5386480093002319,
+ "learning_rate": 1.4360368805448788e-06,
+ "loss": 0.3782,
+ "step": 850
+ },
+ {
+ "epoch": 4.0141065830721,
+ "grad_norm": 0.5074192881584167,
+ "learning_rate": 1.4299765361869837e-06,
+ "loss": 0.3971,
+ "step": 851
+ },
+ {
+ "epoch": 4.018808777429467,
+ "grad_norm": 0.55893874168396,
+ "learning_rate": 1.4239238802572908e-06,
+ "loss": 0.3553,
+ "step": 852
+ },
+ {
+ "epoch": 4.023510971786834,
+ "grad_norm": 0.5474048852920532,
+ "learning_rate": 1.4178789562458847e-06,
+ "loss": 0.3953,
+ "step": 853
+ },
+ {
+ "epoch": 4.028213166144201,
+ "grad_norm": 0.5103669166564941,
+ "learning_rate": 1.4118418075872936e-06,
+ "loss": 0.3801,
+ "step": 854
+ },
+ {
+ "epoch": 4.032915360501567,
+ "grad_norm": 0.48109811544418335,
+ "learning_rate": 1.405812477660178e-06,
+ "loss": 0.3786,
+ "step": 855
+ },
+ {
+ "epoch": 4.037617554858934,
+ "grad_norm": 0.6493998765945435,
+ "learning_rate": 1.3997910097870165e-06,
+ "loss": 0.4014,
+ "step": 856
+ },
+ {
+ "epoch": 4.0423197492163006,
+ "grad_norm": 0.5369696617126465,
+ "learning_rate": 1.3937774472337994e-06,
+ "loss": 0.4058,
+ "step": 857
+ },
+ {
+ "epoch": 4.047021943573668,
+ "grad_norm": 0.5302414894104004,
+ "learning_rate": 1.3877718332097146e-06,
+ "loss": 0.3923,
+ "step": 858
+ },
+ {
+ "epoch": 4.051724137931035,
+ "grad_norm": 0.652701199054718,
+ "learning_rate": 1.3817742108668333e-06,
+ "loss": 0.3972,
+ "step": 859
+ },
+ {
+ "epoch": 4.056426332288401,
+ "grad_norm": 0.5448158979415894,
+ "learning_rate": 1.3757846232998118e-06,
+ "loss": 0.3378,
+ "step": 860
+ },
+ {
+ "epoch": 4.061128526645768,
+ "grad_norm": 0.5433962345123291,
+ "learning_rate": 1.369803113545566e-06,
+ "loss": 0.4121,
+ "step": 861
+ },
+ {
+ "epoch": 4.065830721003135,
+ "grad_norm": 0.5282460451126099,
+ "learning_rate": 1.3638297245829762e-06,
+ "loss": 0.4061,
+ "step": 862
+ },
+ {
+ "epoch": 4.070532915360501,
+ "grad_norm": 0.5211827754974365,
+ "learning_rate": 1.3578644993325701e-06,
+ "loss": 0.4047,
+ "step": 863
+ },
+ {
+ "epoch": 4.075235109717869,
+ "grad_norm": 0.5428538918495178,
+ "learning_rate": 1.3519074806562165e-06,
+ "loss": 0.3947,
+ "step": 864
+ },
+ {
+ "epoch": 4.079937304075235,
+ "grad_norm": 0.5352445244789124,
+ "learning_rate": 1.3459587113568208e-06,
+ "loss": 0.3947,
+ "step": 865
+ },
+ {
+ "epoch": 4.084639498432602,
+ "grad_norm": 0.5329545140266418,
+ "learning_rate": 1.340018234178009e-06,
+ "loss": 0.3987,
+ "step": 866
+ },
+ {
+ "epoch": 4.089341692789969,
+ "grad_norm": 0.5108675956726074,
+ "learning_rate": 1.3340860918038295e-06,
+ "loss": 0.3627,
+ "step": 867
+ },
+ {
+ "epoch": 4.094043887147335,
+ "grad_norm": 0.5213317275047302,
+ "learning_rate": 1.328162326858442e-06,
+ "loss": 0.388,
+ "step": 868
+ },
+ {
+ "epoch": 4.098746081504702,
+ "grad_norm": 0.5299095511436462,
+ "learning_rate": 1.3222469819058112e-06,
+ "loss": 0.3975,
+ "step": 869
+ },
+ {
+ "epoch": 4.103448275862069,
+ "grad_norm": 0.5315486788749695,
+ "learning_rate": 1.3163400994494025e-06,
+ "loss": 0.3989,
+ "step": 870
+ },
+ {
+ "epoch": 4.108150470219436,
+ "grad_norm": 0.614090621471405,
+ "learning_rate": 1.3104417219318762e-06,
+ "loss": 0.3848,
+ "step": 871
+ },
+ {
+ "epoch": 4.112852664576803,
+ "grad_norm": 0.5592188239097595,
+ "learning_rate": 1.3045518917347791e-06,
+ "loss": 0.3928,
+ "step": 872
+ },
+ {
+ "epoch": 4.117554858934169,
+ "grad_norm": 0.551544725894928,
+ "learning_rate": 1.2986706511782476e-06,
+ "loss": 0.3878,
+ "step": 873
+ },
+ {
+ "epoch": 4.122257053291536,
+ "grad_norm": 0.5453651547431946,
+ "learning_rate": 1.2927980425206968e-06,
+ "loss": 0.391,
+ "step": 874
+ },
+ {
+ "epoch": 4.1269592476489025,
+ "grad_norm": 0.5152665972709656,
+ "learning_rate": 1.2869341079585184e-06,
+ "loss": 0.391,
+ "step": 875
+ },
+ {
+ "epoch": 4.131661442006269,
+ "grad_norm": 0.5348275303840637,
+ "learning_rate": 1.2810788896257804e-06,
+ "loss": 0.3603,
+ "step": 876
+ },
+ {
+ "epoch": 4.136363636363637,
+ "grad_norm": 0.7541768550872803,
+ "learning_rate": 1.2752324295939178e-06,
+ "loss": 0.3979,
+ "step": 877
+ },
+ {
+ "epoch": 4.141065830721003,
+ "grad_norm": 0.5464813709259033,
+ "learning_rate": 1.2693947698714409e-06,
+ "loss": 0.4174,
+ "step": 878
+ },
+ {
+ "epoch": 4.14576802507837,
+ "grad_norm": 0.527622401714325,
+ "learning_rate": 1.263565952403622e-06,
+ "loss": 0.3854,
+ "step": 879
+ },
+ {
+ "epoch": 4.150470219435737,
+ "grad_norm": 0.5733200311660767,
+ "learning_rate": 1.2577460190722013e-06,
+ "loss": 0.3989,
+ "step": 880
+ },
+ {
+ "epoch": 4.155172413793103,
+ "grad_norm": 0.587824285030365,
+ "learning_rate": 1.2519350116950842e-06,
+ "loss": 0.4014,
+ "step": 881
+ },
+ {
+ "epoch": 4.15987460815047,
+ "grad_norm": 0.5412627458572388,
+ "learning_rate": 1.2461329720260403e-06,
+ "loss": 0.3861,
+ "step": 882
+ },
+ {
+ "epoch": 4.164576802507837,
+ "grad_norm": 0.5781810283660889,
+ "learning_rate": 1.2403399417544033e-06,
+ "loss": 0.3977,
+ "step": 883
+ },
+ {
+ "epoch": 4.169278996865204,
+ "grad_norm": 0.5613389015197754,
+ "learning_rate": 1.2345559625047718e-06,
+ "loss": 0.404,
+ "step": 884
+ },
+ {
+ "epoch": 4.173981191222571,
+ "grad_norm": 0.5975982546806335,
+ "learning_rate": 1.2287810758367104e-06,
+ "loss": 0.4085,
+ "step": 885
+ },
+ {
+ "epoch": 4.178683385579937,
+ "grad_norm": 0.6386556029319763,
+ "learning_rate": 1.2230153232444511e-06,
+ "loss": 0.3885,
+ "step": 886
+ },
+ {
+ "epoch": 4.183385579937304,
+ "grad_norm": 0.5472486019134521,
+ "learning_rate": 1.217258746156594e-06,
+ "loss": 0.3806,
+ "step": 887
+ },
+ {
+ "epoch": 4.1880877742946705,
+ "grad_norm": 0.7230023145675659,
+ "learning_rate": 1.2115113859358118e-06,
+ "loss": 0.3846,
+ "step": 888
+ },
+ {
+ "epoch": 4.192789968652038,
+ "grad_norm": 0.5451399683952332,
+ "learning_rate": 1.2057732838785514e-06,
+ "loss": 0.3681,
+ "step": 889
+ },
+ {
+ "epoch": 4.197492163009405,
+ "grad_norm": 0.6396780610084534,
+ "learning_rate": 1.2000444812147333e-06,
+ "loss": 0.3812,
+ "step": 890
+ },
+ {
+ "epoch": 4.202194357366771,
+ "grad_norm": 0.6256916522979736,
+ "learning_rate": 1.1943250191074664e-06,
+ "loss": 0.4002,
+ "step": 891
+ },
+ {
+ "epoch": 4.206896551724138,
+ "grad_norm": 0.5897160768508911,
+ "learning_rate": 1.188614938652738e-06,
+ "loss": 0.4073,
+ "step": 892
+ },
+ {
+ "epoch": 4.2115987460815045,
+ "grad_norm": 0.5560889840126038,
+ "learning_rate": 1.1829142808791294e-06,
+ "loss": 0.3689,
+ "step": 893
+ },
+ {
+ "epoch": 4.216300940438871,
+ "grad_norm": 0.5476351380348206,
+ "learning_rate": 1.177223086747516e-06,
+ "loss": 0.3795,
+ "step": 894
+ },
+ {
+ "epoch": 4.221003134796238,
+ "grad_norm": 0.5640100240707397,
+ "learning_rate": 1.1715413971507747e-06,
+ "loss": 0.3935,
+ "step": 895
+ },
+ {
+ "epoch": 4.225705329153605,
+ "grad_norm": 0.5437642335891724,
+ "learning_rate": 1.1658692529134888e-06,
+ "loss": 0.3791,
+ "step": 896
+ },
+ {
+ "epoch": 4.230407523510972,
+ "grad_norm": 0.7564667463302612,
+ "learning_rate": 1.1602066947916565e-06,
+ "loss": 0.4002,
+ "step": 897
+ },
+ {
+ "epoch": 4.235109717868339,
+ "grad_norm": 0.5328983664512634,
+ "learning_rate": 1.154553763472396e-06,
+ "loss": 0.3495,
+ "step": 898
+ },
+ {
+ "epoch": 4.239811912225705,
+ "grad_norm": 0.5688467025756836,
+ "learning_rate": 1.1489104995736543e-06,
+ "loss": 0.3807,
+ "step": 899
+ },
+ {
+ "epoch": 4.244514106583072,
+ "grad_norm": 0.5422545075416565,
+ "learning_rate": 1.1432769436439162e-06,
+ "loss": 0.3955,
+ "step": 900
+ },
+ {
+ "epoch": 4.2492163009404385,
+ "grad_norm": 0.5231274366378784,
+ "learning_rate": 1.1376531361619105e-06,
+ "loss": 0.4035,
+ "step": 901
+ },
+ {
+ "epoch": 4.253918495297806,
+ "grad_norm": 0.578623354434967,
+ "learning_rate": 1.1320391175363225e-06,
+ "loss": 0.3796,
+ "step": 902
+ },
+ {
+ "epoch": 4.258620689655173,
+ "grad_norm": 0.5331007838249207,
+ "learning_rate": 1.126434928105497e-06,
+ "loss": 0.3841,
+ "step": 903
+ },
+ {
+ "epoch": 4.263322884012539,
+ "grad_norm": 0.5077575445175171,
+ "learning_rate": 1.1208406081371612e-06,
+ "loss": 0.386,
+ "step": 904
+ },
+ {
+ "epoch": 4.268025078369906,
+ "grad_norm": 0.5260904431343079,
+ "learning_rate": 1.11525619782812e-06,
+ "loss": 0.4004,
+ "step": 905
+ },
+ {
+ "epoch": 4.2727272727272725,
+ "grad_norm": 0.5973961353302002,
+ "learning_rate": 1.1096817373039773e-06,
+ "loss": 0.4038,
+ "step": 906
+ },
+ {
+ "epoch": 4.277429467084639,
+ "grad_norm": 0.5325058102607727,
+ "learning_rate": 1.104117266618846e-06,
+ "loss": 0.3961,
+ "step": 907
+ },
+ {
+ "epoch": 4.282131661442007,
+ "grad_norm": 0.5536799430847168,
+ "learning_rate": 1.0985628257550575e-06,
+ "loss": 0.3844,
+ "step": 908
+ },
+ {
+ "epoch": 4.286833855799373,
+ "grad_norm": 0.6204715371131897,
+ "learning_rate": 1.0930184546228769e-06,
+ "loss": 0.3916,
+ "step": 909
+ },
+ {
+ "epoch": 4.29153605015674,
+ "grad_norm": 0.5359520316123962,
+ "learning_rate": 1.087484193060215e-06,
+ "loss": 0.3612,
+ "step": 910
+ },
+ {
+ "epoch": 4.2962382445141065,
+ "grad_norm": 0.7552776336669922,
+ "learning_rate": 1.0819600808323424e-06,
+ "loss": 0.3986,
+ "step": 911
+ },
+ {
+ "epoch": 4.300940438871473,
+ "grad_norm": 0.545625627040863,
+ "learning_rate": 1.0764461576316041e-06,
+ "loss": 0.3829,
+ "step": 912
+ },
+ {
+ "epoch": 4.30564263322884,
+ "grad_norm": 0.5795807838439941,
+ "learning_rate": 1.0709424630771333e-06,
+ "loss": 0.3985,
+ "step": 913
+ },
+ {
+ "epoch": 4.310344827586207,
+ "grad_norm": 0.621943473815918,
+ "learning_rate": 1.0654490367145684e-06,
+ "loss": 0.3882,
+ "step": 914
+ },
+ {
+ "epoch": 4.315047021943574,
+ "grad_norm": 0.5678103566169739,
+ "learning_rate": 1.0599659180157678e-06,
+ "loss": 0.4061,
+ "step": 915
+ },
+ {
+ "epoch": 4.3197492163009406,
+ "grad_norm": 0.5638558268547058,
+ "learning_rate": 1.0544931463785237e-06,
+ "loss": 0.4247,
+ "step": 916
+ },
+ {
+ "epoch": 4.324451410658307,
+ "grad_norm": 0.5709723234176636,
+ "learning_rate": 1.049030761126287e-06,
+ "loss": 0.4002,
+ "step": 917
+ },
+ {
+ "epoch": 4.329153605015674,
+ "grad_norm": 0.5887544751167297,
+ "learning_rate": 1.043578801507874e-06,
+ "loss": 0.381,
+ "step": 918
+ },
+ {
+ "epoch": 4.33385579937304,
+ "grad_norm": 0.5499666929244995,
+ "learning_rate": 1.038137306697193e-06,
+ "loss": 0.4029,
+ "step": 919
+ },
+ {
+ "epoch": 4.338557993730408,
+ "grad_norm": 0.676122784614563,
+ "learning_rate": 1.0327063157929582e-06,
+ "loss": 0.3925,
+ "step": 920
+ },
+ {
+ "epoch": 4.343260188087775,
+ "grad_norm": 0.5894976258277893,
+ "learning_rate": 1.027285867818411e-06,
+ "loss": 0.3945,
+ "step": 921
+ },
+ {
+ "epoch": 4.347962382445141,
+ "grad_norm": 0.9533663392066956,
+ "learning_rate": 1.021876001721039e-06,
+ "loss": 0.3402,
+ "step": 922
+ },
+ {
+ "epoch": 4.352664576802508,
+ "grad_norm": 0.5602714419364929,
+ "learning_rate": 1.016476756372295e-06,
+ "loss": 0.3901,
+ "step": 923
+ },
+ {
+ "epoch": 4.3573667711598745,
+ "grad_norm": 0.5252093076705933,
+ "learning_rate": 1.011088170567319e-06,
+ "loss": 0.3807,
+ "step": 924
+ },
+ {
+ "epoch": 4.362068965517241,
+ "grad_norm": 0.5782448053359985,
+ "learning_rate": 1.0057102830246596e-06,
+ "loss": 0.373,
+ "step": 925
+ },
+ {
+ "epoch": 4.366771159874608,
+ "grad_norm": 0.5740293264389038,
+ "learning_rate": 1.0003431323859943e-06,
+ "loss": 0.4013,
+ "step": 926
+ },
+ {
+ "epoch": 4.371473354231975,
+ "grad_norm": 0.5553807616233826,
+ "learning_rate": 9.949867572158544e-07,
+ "loss": 0.3909,
+ "step": 927
+ },
+ {
+ "epoch": 4.376175548589342,
+ "grad_norm": 0.5707646012306213,
+ "learning_rate": 9.896411960013455e-07,
+ "loss": 0.4001,
+ "step": 928
+ },
+ {
+ "epoch": 4.3808777429467085,
+ "grad_norm": 0.6075118184089661,
+ "learning_rate": 9.843064871518694e-07,
+ "loss": 0.3815,
+ "step": 929
+ },
+ {
+ "epoch": 4.385579937304075,
+ "grad_norm": 0.535280168056488,
+ "learning_rate": 9.78982668998856e-07,
+ "loss": 0.3741,
+ "step": 930
+ },
+ {
+ "epoch": 4.390282131661442,
+ "grad_norm": 0.5094203352928162,
+ "learning_rate": 9.736697797954766e-07,
+ "loss": 0.4004,
+ "step": 931
+ },
+ {
+ "epoch": 4.394984326018808,
+ "grad_norm": 0.5600079298019409,
+ "learning_rate": 9.683678577163788e-07,
+ "loss": 0.3935,
+ "step": 932
+ },
+ {
+ "epoch": 4.399686520376176,
+ "grad_norm": 0.5435491800308228,
+ "learning_rate": 9.630769408574065e-07,
+ "loss": 0.3676,
+ "step": 933
+ },
+ {
+ "epoch": 4.4043887147335425,
+ "grad_norm": 0.5918356776237488,
+ "learning_rate": 9.577970672353274e-07,
+ "loss": 0.373,
+ "step": 934
+ },
+ {
+ "epoch": 4.409090909090909,
+ "grad_norm": 0.547618567943573,
+ "learning_rate": 9.525282747875636e-07,
+ "loss": 0.3674,
+ "step": 935
+ },
+ {
+ "epoch": 4.413793103448276,
+ "grad_norm": 0.6398045420646667,
+ "learning_rate": 9.472706013719113e-07,
+ "loss": 0.3947,
+ "step": 936
+ },
+ {
+ "epoch": 4.418495297805642,
+ "grad_norm": 0.5805232524871826,
+ "learning_rate": 9.420240847662759e-07,
+ "loss": 0.3803,
+ "step": 937
+ },
+ {
+ "epoch": 4.423197492163009,
+ "grad_norm": 0.5517405867576599,
+ "learning_rate": 9.367887626683975e-07,
+ "loss": 0.4065,
+ "step": 938
+ },
+ {
+ "epoch": 4.427899686520377,
+ "grad_norm": 0.563588559627533,
+ "learning_rate": 9.315646726955798e-07,
+ "loss": 0.3844,
+ "step": 939
+ },
+ {
+ "epoch": 4.432601880877743,
+ "grad_norm": 0.7672348022460938,
+ "learning_rate": 9.263518523844211e-07,
+ "loss": 0.3827,
+ "step": 940
+ },
+ {
+ "epoch": 4.43730407523511,
+ "grad_norm": 0.54765784740448,
+ "learning_rate": 9.211503391905446e-07,
+ "loss": 0.3856,
+ "step": 941
+ },
+ {
+ "epoch": 4.4420062695924765,
+ "grad_norm": 0.5360795259475708,
+ "learning_rate": 9.159601704883253e-07,
+ "loss": 0.3902,
+ "step": 942
+ },
+ {
+ "epoch": 4.446708463949843,
+ "grad_norm": 0.5291644334793091,
+ "learning_rate": 9.107813835706303e-07,
+ "loss": 0.3617,
+ "step": 943
+ },
+ {
+ "epoch": 4.45141065830721,
+ "grad_norm": 0.5579796433448792,
+ "learning_rate": 9.056140156485385e-07,
+ "loss": 0.3777,
+ "step": 944
+ },
+ {
+ "epoch": 4.456112852664576,
+ "grad_norm": 0.7645874619483948,
+ "learning_rate": 9.004581038510865e-07,
+ "loss": 0.3877,
+ "step": 945
+ },
+ {
+ "epoch": 4.460815047021944,
+ "grad_norm": 0.5321459174156189,
+ "learning_rate": 8.953136852249922e-07,
+ "loss": 0.4057,
+ "step": 946
+ },
+ {
+ "epoch": 4.4655172413793105,
+ "grad_norm": 0.5971282720565796,
+ "learning_rate": 8.901807967343898e-07,
+ "loss": 0.3998,
+ "step": 947
+ },
+ {
+ "epoch": 4.470219435736677,
+ "grad_norm": 0.5772238373756409,
+ "learning_rate": 8.850594752605712e-07,
+ "loss": 0.3967,
+ "step": 948
+ },
+ {
+ "epoch": 4.474921630094044,
+ "grad_norm": 0.5422664284706116,
+ "learning_rate": 8.79949757601711e-07,
+ "loss": 0.3882,
+ "step": 949
+ },
+ {
+ "epoch": 4.47962382445141,
+ "grad_norm": 0.5209662914276123,
+ "learning_rate": 8.748516804726096e-07,
+ "loss": 0.3872,
+ "step": 950
+ },
+ {
+ "epoch": 4.484326018808777,
+ "grad_norm": 0.6436011791229248,
+ "learning_rate": 8.697652805044265e-07,
+ "loss": 0.3669,
+ "step": 951
+ },
+ {
+ "epoch": 4.4890282131661445,
+ "grad_norm": 0.5284281969070435,
+ "learning_rate": 8.646905942444172e-07,
+ "loss": 0.3731,
+ "step": 952
+ },
+ {
+ "epoch": 4.493730407523511,
+ "grad_norm": 0.857571542263031,
+ "learning_rate": 8.59627658155671e-07,
+ "loss": 0.3933,
+ "step": 953
+ },
+ {
+ "epoch": 4.498432601880878,
+ "grad_norm": 0.5689031481742859,
+ "learning_rate": 8.545765086168484e-07,
+ "loss": 0.3836,
+ "step": 954
+ },
+ {
+ "epoch": 4.503134796238244,
+ "grad_norm": 0.5461127758026123,
+ "learning_rate": 8.495371819219206e-07,
+ "loss": 0.3984,
+ "step": 955
+ },
+ {
+ "epoch": 4.507836990595611,
+ "grad_norm": 0.591744065284729,
+ "learning_rate": 8.44509714279908e-07,
+ "loss": 0.4096,
+ "step": 956
+ },
+ {
+ "epoch": 4.512539184952978,
+ "grad_norm": 0.5600095391273499,
+ "learning_rate": 8.394941418146202e-07,
+ "loss": 0.4012,
+ "step": 957
+ },
+ {
+ "epoch": 4.517241379310345,
+ "grad_norm": 0.5238003730773926,
+ "learning_rate": 8.344905005643967e-07,
+ "loss": 0.4019,
+ "step": 958
+ },
+ {
+ "epoch": 4.521943573667712,
+ "grad_norm": 0.5452944040298462,
+ "learning_rate": 8.294988264818488e-07,
+ "loss": 0.391,
+ "step": 959
+ },
+ {
+ "epoch": 4.5266457680250785,
+ "grad_norm": 0.570563554763794,
+ "learning_rate": 8.245191554335963e-07,
+ "loss": 0.3836,
+ "step": 960
+ },
+ {
+ "epoch": 4.531347962382445,
+ "grad_norm": 0.526006281375885,
+ "learning_rate": 8.1955152320002e-07,
+ "loss": 0.3894,
+ "step": 961
+ },
+ {
+ "epoch": 4.536050156739812,
+ "grad_norm": 0.6105053424835205,
+ "learning_rate": 8.145959654749924e-07,
+ "loss": 0.4004,
+ "step": 962
+ },
+ {
+ "epoch": 4.540752351097178,
+ "grad_norm": 0.6597625017166138,
+ "learning_rate": 8.096525178656306e-07,
+ "loss": 0.3694,
+ "step": 963
+ },
+ {
+ "epoch": 4.545454545454545,
+ "grad_norm": 0.546521782875061,
+ "learning_rate": 8.047212158920362e-07,
+ "loss": 0.397,
+ "step": 964
+ },
+ {
+ "epoch": 4.5501567398119125,
+ "grad_norm": 0.518375813961029,
+ "learning_rate": 7.998020949870402e-07,
+ "loss": 0.4126,
+ "step": 965
+ },
+ {
+ "epoch": 4.554858934169279,
+ "grad_norm": 0.6008384823799133,
+ "learning_rate": 7.948951904959504e-07,
+ "loss": 0.3799,
+ "step": 966
+ },
+ {
+ "epoch": 4.559561128526646,
+ "grad_norm": 0.5546853542327881,
+ "learning_rate": 7.900005376762948e-07,
+ "loss": 0.3894,
+ "step": 967
+ },
+ {
+ "epoch": 4.564263322884012,
+ "grad_norm": 0.5475030541419983,
+ "learning_rate": 7.851181716975703e-07,
+ "loss": 0.3977,
+ "step": 968
+ },
+ {
+ "epoch": 4.568965517241379,
+ "grad_norm": 0.5156254172325134,
+ "learning_rate": 7.802481276409896e-07,
+ "loss": 0.3635,
+ "step": 969
+ },
+ {
+ "epoch": 4.5736677115987465,
+ "grad_norm": 0.5934706330299377,
+ "learning_rate": 7.75390440499228e-07,
+ "loss": 0.3735,
+ "step": 970
+ },
+ {
+ "epoch": 4.578369905956113,
+ "grad_norm": 0.5446907877922058,
+ "learning_rate": 7.705451451761734e-07,
+ "loss": 0.3722,
+ "step": 971
+ },
+ {
+ "epoch": 4.58307210031348,
+ "grad_norm": 0.5843047499656677,
+ "learning_rate": 7.657122764866754e-07,
+ "loss": 0.37,
+ "step": 972
+ },
+ {
+ "epoch": 4.587774294670846,
+ "grad_norm": 0.5700147747993469,
+ "learning_rate": 7.608918691562914e-07,
+ "loss": 0.4071,
+ "step": 973
+ },
+ {
+ "epoch": 4.592476489028213,
+ "grad_norm": 0.5433696508407593,
+ "learning_rate": 7.560839578210466e-07,
+ "loss": 0.371,
+ "step": 974
+ },
+ {
+ "epoch": 4.59717868338558,
+ "grad_norm": 1.282175064086914,
+ "learning_rate": 7.512885770271722e-07,
+ "loss": 0.3936,
+ "step": 975
+ },
+ {
+ "epoch": 4.601880877742946,
+ "grad_norm": 0.6217600107192993,
+ "learning_rate": 7.465057612308676e-07,
+ "loss": 0.3902,
+ "step": 976
+ },
+ {
+ "epoch": 4.606583072100314,
+ "grad_norm": 0.536109983921051,
+ "learning_rate": 7.417355447980484e-07,
+ "loss": 0.3955,
+ "step": 977
+ },
+ {
+ "epoch": 4.61128526645768,
+ "grad_norm": 0.5526042580604553,
+ "learning_rate": 7.369779620041001e-07,
+ "loss": 0.382,
+ "step": 978
+ },
+ {
+ "epoch": 4.615987460815047,
+ "grad_norm": 0.5479426980018616,
+ "learning_rate": 7.322330470336314e-07,
+ "loss": 0.4093,
+ "step": 979
+ },
+ {
+ "epoch": 4.620689655172414,
+ "grad_norm": 0.7818365693092346,
+ "learning_rate": 7.275008339802295e-07,
+ "loss": 0.3924,
+ "step": 980
+ },
+ {
+ "epoch": 4.62539184952978,
+ "grad_norm": 0.5750322937965393,
+ "learning_rate": 7.227813568462141e-07,
+ "loss": 0.3742,
+ "step": 981
+ },
+ {
+ "epoch": 4.630094043887147,
+ "grad_norm": 0.5750429034233093,
+ "learning_rate": 7.180746495423946e-07,
+ "loss": 0.3914,
+ "step": 982
+ },
+ {
+ "epoch": 4.6347962382445145,
+ "grad_norm": 0.5530590415000916,
+ "learning_rate": 7.133807458878247e-07,
+ "loss": 0.3896,
+ "step": 983
+ },
+ {
+ "epoch": 4.639498432601881,
+ "grad_norm": 0.5401444435119629,
+ "learning_rate": 7.086996796095599e-07,
+ "loss": 0.3832,
+ "step": 984
+ },
+ {
+ "epoch": 4.644200626959248,
+ "grad_norm": 0.5471640229225159,
+ "learning_rate": 7.040314843424173e-07,
+ "loss": 0.3922,
+ "step": 985
+ },
+ {
+ "epoch": 4.648902821316614,
+ "grad_norm": 0.5962896943092346,
+ "learning_rate": 6.99376193628728e-07,
+ "loss": 0.3839,
+ "step": 986
+ },
+ {
+ "epoch": 4.653605015673981,
+ "grad_norm": 0.5511114597320557,
+ "learning_rate": 6.947338409181056e-07,
+ "loss": 0.3867,
+ "step": 987
+ },
+ {
+ "epoch": 4.658307210031348,
+ "grad_norm": 0.5311186909675598,
+ "learning_rate": 6.90104459567196e-07,
+ "loss": 0.3886,
+ "step": 988
+ },
+ {
+ "epoch": 4.663009404388715,
+ "grad_norm": 0.7723526358604431,
+ "learning_rate": 6.854880828394442e-07,
+ "loss": 0.379,
+ "step": 989
+ },
+ {
+ "epoch": 4.667711598746082,
+ "grad_norm": 0.5676357746124268,
+ "learning_rate": 6.808847439048524e-07,
+ "loss": 0.4067,
+ "step": 990
+ },
+ {
+ "epoch": 4.672413793103448,
+ "grad_norm": 0.9501140713691711,
+ "learning_rate": 6.762944758397432e-07,
+ "loss": 0.3914,
+ "step": 991
+ },
+ {
+ "epoch": 4.677115987460815,
+ "grad_norm": 0.5385439395904541,
+ "learning_rate": 6.717173116265208e-07,
+ "loss": 0.3842,
+ "step": 992
+ },
+ {
+ "epoch": 4.681818181818182,
+ "grad_norm": 0.5315724611282349,
+ "learning_rate": 6.671532841534345e-07,
+ "loss": 0.3952,
+ "step": 993
+ },
+ {
+ "epoch": 4.686520376175548,
+ "grad_norm": 0.580390214920044,
+ "learning_rate": 6.626024262143421e-07,
+ "loss": 0.4011,
+ "step": 994
+ },
+ {
+ "epoch": 4.691222570532915,
+ "grad_norm": 0.5717929005622864,
+ "learning_rate": 6.58064770508475e-07,
+ "loss": 0.3848,
+ "step": 995
+ },
+ {
+ "epoch": 4.695924764890282,
+ "grad_norm": 0.7644345164299011,
+ "learning_rate": 6.535403496402023e-07,
+ "loss": 0.3718,
+ "step": 996
+ },
+ {
+ "epoch": 4.700626959247649,
+ "grad_norm": 0.8252847790718079,
+ "learning_rate": 6.490291961187975e-07,
+ "loss": 0.3756,
+ "step": 997
+ },
+ {
+ "epoch": 4.705329153605016,
+ "grad_norm": 0.6276743412017822,
+ "learning_rate": 6.445313423582039e-07,
+ "loss": 0.4097,
+ "step": 998
+ },
+ {
+ "epoch": 4.710031347962382,
+ "grad_norm": 0.5425130724906921,
+ "learning_rate": 6.400468206768004e-07,
+ "loss": 0.3926,
+ "step": 999
+ },
+ {
+ "epoch": 4.714733542319749,
+ "grad_norm": 0.5565195083618164,
+ "learning_rate": 6.35575663297176e-07,
+ "loss": 0.3973,
+ "step": 1000
+ },
+ {
+ "epoch": 4.7194357366771165,
+ "grad_norm": 0.5730810165405273,
+ "learning_rate": 6.31117902345888e-07,
+ "loss": 0.3448,
+ "step": 1001
+ },
+ {
+ "epoch": 4.724137931034483,
+ "grad_norm": 0.6187518835067749,
+ "learning_rate": 6.266735698532392e-07,
+ "loss": 0.387,
+ "step": 1002
+ },
+ {
+ "epoch": 4.72884012539185,
+ "grad_norm": 0.5731320381164551,
+ "learning_rate": 6.222426977530449e-07,
+ "loss": 0.4064,
+ "step": 1003
+ },
+ {
+ "epoch": 4.733542319749216,
+ "grad_norm": 0.5795004367828369,
+ "learning_rate": 6.178253178824029e-07,
+ "loss": 0.3985,
+ "step": 1004
+ },
+ {
+ "epoch": 4.738244514106583,
+ "grad_norm": 0.5685634016990662,
+ "learning_rate": 6.134214619814657e-07,
+ "loss": 0.3817,
+ "step": 1005
+ },
+ {
+ "epoch": 4.74294670846395,
+ "grad_norm": 0.5926253199577332,
+ "learning_rate": 6.090311616932127e-07,
+ "loss": 0.3735,
+ "step": 1006
+ },
+ {
+ "epoch": 4.747648902821316,
+ "grad_norm": 0.5256511569023132,
+ "learning_rate": 6.04654448563221e-07,
+ "loss": 0.3805,
+ "step": 1007
+ },
+ {
+ "epoch": 4.752351097178684,
+ "grad_norm": 0.5808703303337097,
+ "learning_rate": 6.002913540394417e-07,
+ "loss": 0.3615,
+ "step": 1008
+ },
+ {
+ "epoch": 4.75705329153605,
+ "grad_norm": 0.5645278692245483,
+ "learning_rate": 5.959419094719713e-07,
+ "loss": 0.405,
+ "step": 1009
+ },
+ {
+ "epoch": 4.761755485893417,
+ "grad_norm": 0.535028874874115,
+ "learning_rate": 5.916061461128269e-07,
+ "loss": 0.3823,
+ "step": 1010
+ },
+ {
+ "epoch": 4.766457680250784,
+ "grad_norm": 0.5427082180976868,
+ "learning_rate": 5.872840951157241e-07,
+ "loss": 0.3643,
+ "step": 1011
+ },
+ {
+ "epoch": 4.77115987460815,
+ "grad_norm": 0.5948965549468994,
+ "learning_rate": 5.829757875358477e-07,
+ "loss": 0.3834,
+ "step": 1012
+ },
+ {
+ "epoch": 4.775862068965517,
+ "grad_norm": 1.4611191749572754,
+ "learning_rate": 5.786812543296372e-07,
+ "loss": 0.3923,
+ "step": 1013
+ },
+ {
+ "epoch": 4.7805642633228835,
+ "grad_norm": 0.5925397276878357,
+ "learning_rate": 5.744005263545538e-07,
+ "loss": 0.4105,
+ "step": 1014
+ },
+ {
+ "epoch": 4.785266457680251,
+ "grad_norm": 0.5865631103515625,
+ "learning_rate": 5.701336343688671e-07,
+ "loss": 0.4086,
+ "step": 1015
+ },
+ {
+ "epoch": 4.789968652037618,
+ "grad_norm": 0.5993569493293762,
+ "learning_rate": 5.658806090314322e-07,
+ "loss": 0.3738,
+ "step": 1016
+ },
+ {
+ "epoch": 4.794670846394984,
+ "grad_norm": 0.5465255975723267,
+ "learning_rate": 5.616414809014647e-07,
+ "loss": 0.3801,
+ "step": 1017
+ },
+ {
+ "epoch": 4.799373040752351,
+ "grad_norm": 0.5121073722839355,
+ "learning_rate": 5.574162804383293e-07,
+ "loss": 0.3896,
+ "step": 1018
+ },
+ {
+ "epoch": 4.804075235109718,
+ "grad_norm": 0.5888665318489075,
+ "learning_rate": 5.532050380013115e-07,
+ "loss": 0.3833,
+ "step": 1019
+ },
+ {
+ "epoch": 4.808777429467085,
+ "grad_norm": 0.5188261866569519,
+ "learning_rate": 5.490077838494079e-07,
+ "loss": 0.4127,
+ "step": 1020
+ },
+ {
+ "epoch": 4.813479623824452,
+ "grad_norm": 0.5498382449150085,
+ "learning_rate": 5.448245481411041e-07,
+ "loss": 0.3933,
+ "step": 1021
+ },
+ {
+ "epoch": 4.818181818181818,
+ "grad_norm": 0.5509280562400818,
+ "learning_rate": 5.406553609341586e-07,
+ "loss": 0.3912,
+ "step": 1022
+ },
+ {
+ "epoch": 4.822884012539185,
+ "grad_norm": 0.5588513612747192,
+ "learning_rate": 5.365002521853882e-07,
+ "loss": 0.3757,
+ "step": 1023
+ },
+ {
+ "epoch": 4.827586206896552,
+ "grad_norm": 0.5885221362113953,
+ "learning_rate": 5.32359251750452e-07,
+ "loss": 0.3836,
+ "step": 1024
+ },
+ {
+ "epoch": 4.832288401253918,
+ "grad_norm": 0.7824872136116028,
+ "learning_rate": 5.282323893836347e-07,
+ "loss": 0.4078,
+ "step": 1025
+ },
+ {
+ "epoch": 4.836990595611285,
+ "grad_norm": 0.5329296588897705,
+ "learning_rate": 5.241196947376382e-07,
+ "loss": 0.3844,
+ "step": 1026
+ },
+ {
+ "epoch": 4.841692789968652,
+ "grad_norm": 0.5577712059020996,
+ "learning_rate": 5.200211973633632e-07,
+ "loss": 0.4107,
+ "step": 1027
+ },
+ {
+ "epoch": 4.846394984326019,
+ "grad_norm": 0.854481041431427,
+ "learning_rate": 5.15936926709699e-07,
+ "loss": 0.3988,
+ "step": 1028
+ },
+ {
+ "epoch": 4.851097178683386,
+ "grad_norm": 0.5857868790626526,
+ "learning_rate": 5.118669121233127e-07,
+ "loss": 0.3935,
+ "step": 1029
+ },
+ {
+ "epoch": 4.855799373040752,
+ "grad_norm": 0.5981507897377014,
+ "learning_rate": 5.078111828484347e-07,
+ "loss": 0.3914,
+ "step": 1030
+ },
+ {
+ "epoch": 4.860501567398119,
+ "grad_norm": 0.5649446845054626,
+ "learning_rate": 5.037697680266565e-07,
+ "loss": 0.3961,
+ "step": 1031
+ },
+ {
+ "epoch": 4.8652037617554855,
+ "grad_norm": 0.5941659808158875,
+ "learning_rate": 4.997426966967106e-07,
+ "loss": 0.3942,
+ "step": 1032
+ },
+ {
+ "epoch": 4.869905956112853,
+ "grad_norm": 0.581913411617279,
+ "learning_rate": 4.957299977942704e-07,
+ "loss": 0.3806,
+ "step": 1033
+ },
+ {
+ "epoch": 4.87460815047022,
+ "grad_norm": 0.5254392027854919,
+ "learning_rate": 4.917317001517389e-07,
+ "loss": 0.3859,
+ "step": 1034
+ },
+ {
+ "epoch": 4.879310344827586,
+ "grad_norm": 0.5529137849807739,
+ "learning_rate": 4.877478324980412e-07,
+ "loss": 0.4055,
+ "step": 1035
+ },
+ {
+ "epoch": 4.884012539184953,
+ "grad_norm": 0.5569112300872803,
+ "learning_rate": 4.837784234584194e-07,
+ "loss": 0.3771,
+ "step": 1036
+ },
+ {
+ "epoch": 4.88871473354232,
+ "grad_norm": 0.6729010343551636,
+ "learning_rate": 4.79823501554226e-07,
+ "loss": 0.3983,
+ "step": 1037
+ },
+ {
+ "epoch": 4.893416927899686,
+ "grad_norm": 0.5438387989997864,
+ "learning_rate": 4.7588309520271934e-07,
+ "loss": 0.3805,
+ "step": 1038
+ },
+ {
+ "epoch": 4.898119122257054,
+ "grad_norm": 0.5601168870925903,
+ "learning_rate": 4.7195723271685893e-07,
+ "loss": 0.413,
+ "step": 1039
+ },
+ {
+ "epoch": 4.90282131661442,
+ "grad_norm": 0.5603858232498169,
+ "learning_rate": 4.6804594230510286e-07,
+ "loss": 0.4093,
+ "step": 1040
+ },
+ {
+ "epoch": 4.907523510971787,
+ "grad_norm": 0.5581585764884949,
+ "learning_rate": 4.641492520712043e-07,
+ "loss": 0.3877,
+ "step": 1041
+ },
+ {
+ "epoch": 4.912225705329154,
+ "grad_norm": 0.6802616119384766,
+ "learning_rate": 4.60267190014011e-07,
+ "loss": 0.3965,
+ "step": 1042
+ },
+ {
+ "epoch": 4.91692789968652,
+ "grad_norm": 0.5508768558502197,
+ "learning_rate": 4.563997840272602e-07,
+ "loss": 0.3833,
+ "step": 1043
+ },
+ {
+ "epoch": 4.921630094043887,
+ "grad_norm": 0.9818223714828491,
+ "learning_rate": 4.5254706189938545e-07,
+ "loss": 0.3689,
+ "step": 1044
+ },
+ {
+ "epoch": 4.9263322884012535,
+ "grad_norm": 0.5540556907653809,
+ "learning_rate": 4.4870905131330827e-07,
+ "loss": 0.4081,
+ "step": 1045
+ },
+ {
+ "epoch": 4.931034482758621,
+ "grad_norm": 0.5338829159736633,
+ "learning_rate": 4.448857798462455e-07,
+ "loss": 0.4071,
+ "step": 1046
+ },
+ {
+ "epoch": 4.935736677115988,
+ "grad_norm": 0.5587465763092041,
+ "learning_rate": 4.4107727496950913e-07,
+ "loss": 0.3801,
+ "step": 1047
+ },
+ {
+ "epoch": 4.940438871473354,
+ "grad_norm": 0.5150395631790161,
+ "learning_rate": 4.372835640483089e-07,
+ "loss": 0.4002,
+ "step": 1048
+ },
+ {
+ "epoch": 4.945141065830721,
+ "grad_norm": 0.5582529902458191,
+ "learning_rate": 4.3350467434155526e-07,
+ "loss": 0.393,
+ "step": 1049
+ },
+ {
+ "epoch": 4.9498432601880875,
+ "grad_norm": 0.5755763649940491,
+ "learning_rate": 4.297406330016643e-07,
+ "loss": 0.3838,
+ "step": 1050
+ },
+ {
+ "epoch": 4.954545454545455,
+ "grad_norm": 0.5632887482643127,
+ "learning_rate": 4.25991467074362e-07,
+ "loss": 0.3752,
+ "step": 1051
+ },
+ {
+ "epoch": 4.959247648902822,
+ "grad_norm": 0.5089020729064941,
+ "learning_rate": 4.2225720349849063e-07,
+ "loss": 0.3873,
+ "step": 1052
+ },
+ {
+ "epoch": 4.963949843260188,
+ "grad_norm": 0.6206353306770325,
+ "learning_rate": 4.185378691058145e-07,
+ "loss": 0.3837,
+ "step": 1053
+ },
+ {
+ "epoch": 4.968652037617555,
+ "grad_norm": 0.8949421048164368,
+ "learning_rate": 4.148334906208273e-07,
+ "loss": 0.4126,
+ "step": 1054
+ },
+ {
+ "epoch": 4.9733542319749215,
+ "grad_norm": 0.5514953136444092,
+ "learning_rate": 4.1114409466056107e-07,
+ "loss": 0.3897,
+ "step": 1055
+ },
+ {
+ "epoch": 4.978056426332288,
+ "grad_norm": 0.681344211101532,
+ "learning_rate": 4.0746970773439115e-07,
+ "loss": 0.4165,
+ "step": 1056
+ },
+ {
+ "epoch": 4.982758620689655,
+ "grad_norm": 0.5986515283584595,
+ "learning_rate": 4.0381035624385336e-07,
+ "loss": 0.4007,
+ "step": 1057
+ },
+ {
+ "epoch": 4.987460815047022,
+ "grad_norm": 0.502730131149292,
+ "learning_rate": 4.0016606648244555e-07,
+ "loss": 0.3614,
+ "step": 1058
+ },
+ {
+ "epoch": 4.992163009404389,
+ "grad_norm": 0.5898148417472839,
+ "learning_rate": 3.9653686463544447e-07,
+ "loss": 0.4064,
+ "step": 1059
+ },
+ {
+ "epoch": 4.996865203761756,
+ "grad_norm": 0.6192370057106018,
+ "learning_rate": 3.929227767797153e-07,
+ "loss": 0.4027,
+ "step": 1060
+ },
+ {
+ "epoch": 5.004702194357367,
+ "grad_norm": 1.189609408378601,
+ "learning_rate": 3.8932382888352547e-07,
+ "loss": 0.7715,
+ "step": 1061
+ },
+ {
+ "epoch": 5.009404388714733,
+ "grad_norm": 0.552861750125885,
+ "learning_rate": 3.8574004680635686e-07,
+ "loss": 0.3696,
+ "step": 1062
+ },
+ {
+ "epoch": 5.0141065830721,
+ "grad_norm": 0.564900279045105,
+ "learning_rate": 3.8217145629872054e-07,
+ "loss": 0.3891,
+ "step": 1063
+ },
+ {
+ "epoch": 5.018808777429467,
+ "grad_norm": 0.527047872543335,
+ "learning_rate": 3.786180830019717e-07,
+ "loss": 0.4155,
+ "step": 1064
+ },
+ {
+ "epoch": 5.023510971786834,
+ "grad_norm": 0.5116733312606812,
+ "learning_rate": 3.7507995244812636e-07,
+ "loss": 0.3321,
+ "step": 1065
+ },
+ {
+ "epoch": 5.028213166144201,
+ "grad_norm": 0.5564514398574829,
+ "learning_rate": 3.7155709005967544e-07,
+ "loss": 0.3762,
+ "step": 1066
+ },
+ {
+ "epoch": 5.032915360501567,
+ "grad_norm": 0.5802388787269592,
+ "learning_rate": 3.6804952114940504e-07,
+ "loss": 0.3683,
+ "step": 1067
+ },
+ {
+ "epoch": 5.037617554858934,
+ "grad_norm": 0.5543890595436096,
+ "learning_rate": 3.645572709202136e-07,
+ "loss": 0.3618,
+ "step": 1068
+ },
+ {
+ "epoch": 5.0423197492163006,
+ "grad_norm": 0.5662654042243958,
+ "learning_rate": 3.610803644649269e-07,
+ "loss": 0.3389,
+ "step": 1069
+ },
+ {
+ "epoch": 5.047021943573668,
+ "grad_norm": 0.5756992697715759,
+ "learning_rate": 3.576188267661271e-07,
+ "loss": 0.3739,
+ "step": 1070
+ },
+ {
+ "epoch": 5.051724137931035,
+ "grad_norm": 0.5667081475257874,
+ "learning_rate": 3.5417268269596186e-07,
+ "loss": 0.3877,
+ "step": 1071
+ },
+ {
+ "epoch": 5.056426332288401,
+ "grad_norm": 0.5901455879211426,
+ "learning_rate": 3.5074195701597423e-07,
+ "loss": 0.3767,
+ "step": 1072
+ },
+ {
+ "epoch": 5.061128526645768,
+ "grad_norm": 0.5125202536582947,
+ "learning_rate": 3.4732667437692075e-07,
+ "loss": 0.3873,
+ "step": 1073
+ },
+ {
+ "epoch": 5.065830721003135,
+ "grad_norm": 0.5557279586791992,
+ "learning_rate": 3.439268593185957e-07,
+ "loss": 0.378,
+ "step": 1074
+ },
+ {
+ "epoch": 5.070532915360501,
+ "grad_norm": 1.2608096599578857,
+ "learning_rate": 3.4054253626965404e-07,
+ "loss": 0.3515,
+ "step": 1075
+ },
+ {
+ "epoch": 5.075235109717869,
+ "grad_norm": 0.7257769703865051,
+ "learning_rate": 3.371737295474359e-07,
+ "loss": 0.3861,
+ "step": 1076
+ },
+ {
+ "epoch": 5.079937304075235,
+ "grad_norm": 0.5687916278839111,
+ "learning_rate": 3.338204633577924e-07,
+ "loss": 0.3622,
+ "step": 1077
+ },
+ {
+ "epoch": 5.084639498432602,
+ "grad_norm": 0.567475438117981,
+ "learning_rate": 3.3048276179491135e-07,
+ "loss": 0.4064,
+ "step": 1078
+ },
+ {
+ "epoch": 5.089341692789969,
+ "grad_norm": 0.5369480848312378,
+ "learning_rate": 3.271606488411447e-07,
+ "loss": 0.3735,
+ "step": 1079
+ },
+ {
+ "epoch": 5.094043887147335,
+ "grad_norm": 0.5459705591201782,
+ "learning_rate": 3.238541483668345e-07,
+ "loss": 0.3414,
+ "step": 1080
+ },
+ {
+ "epoch": 5.098746081504702,
+ "grad_norm": 0.5372116565704346,
+ "learning_rate": 3.2056328413014456e-07,
+ "loss": 0.3983,
+ "step": 1081
+ },
+ {
+ "epoch": 5.103448275862069,
+ "grad_norm": 0.543656051158905,
+ "learning_rate": 3.172880797768849e-07,
+ "loss": 0.3658,
+ "step": 1082
+ },
+ {
+ "epoch": 5.108150470219436,
+ "grad_norm": 0.5288692712783813,
+ "learning_rate": 3.1402855884034856e-07,
+ "loss": 0.3577,
+ "step": 1083
+ },
+ {
+ "epoch": 5.112852664576803,
+ "grad_norm": 0.531522810459137,
+ "learning_rate": 3.1078474474113497e-07,
+ "loss": 0.3677,
+ "step": 1084
+ },
+ {
+ "epoch": 5.117554858934169,
+ "grad_norm": 0.561164915561676,
+ "learning_rate": 3.075566607869876e-07,
+ "loss": 0.3906,
+ "step": 1085
+ },
+ {
+ "epoch": 5.122257053291536,
+ "grad_norm": 0.5604991912841797,
+ "learning_rate": 3.04344330172624e-07,
+ "loss": 0.3733,
+ "step": 1086
+ },
+ {
+ "epoch": 5.1269592476489025,
+ "grad_norm": 0.6196678280830383,
+ "learning_rate": 3.0114777597956835e-07,
+ "loss": 0.3537,
+ "step": 1087
+ },
+ {
+ "epoch": 5.131661442006269,
+ "grad_norm": 0.5680385828018188,
+ "learning_rate": 2.9796702117598884e-07,
+ "loss": 0.3789,
+ "step": 1088
+ },
+ {
+ "epoch": 5.136363636363637,
+ "grad_norm": 0.559177041053772,
+ "learning_rate": 2.948020886165279e-07,
+ "loss": 0.3754,
+ "step": 1089
+ },
+ {
+ "epoch": 5.141065830721003,
+ "grad_norm": 0.703450083732605,
+ "learning_rate": 2.91653001042142e-07,
+ "loss": 0.3911,
+ "step": 1090
+ },
+ {
+ "epoch": 5.14576802507837,
+ "grad_norm": 0.6118397116661072,
+ "learning_rate": 2.885197810799367e-07,
+ "loss": 0.3464,
+ "step": 1091
+ },
+ {
+ "epoch": 5.150470219435737,
+ "grad_norm": 0.5417131781578064,
+ "learning_rate": 2.854024512430043e-07,
+ "loss": 0.3792,
+ "step": 1092
+ },
+ {
+ "epoch": 5.155172413793103,
+ "grad_norm": 0.5390523076057434,
+ "learning_rate": 2.8230103393026094e-07,
+ "loss": 0.3644,
+ "step": 1093
+ },
+ {
+ "epoch": 5.15987460815047,
+ "grad_norm": 0.5768479108810425,
+ "learning_rate": 2.792155514262887e-07,
+ "loss": 0.3829,
+ "step": 1094
+ },
+ {
+ "epoch": 5.164576802507837,
+ "grad_norm": 0.5356476902961731,
+ "learning_rate": 2.761460259011703e-07,
+ "loss": 0.3513,
+ "step": 1095
+ },
+ {
+ "epoch": 5.169278996865204,
+ "grad_norm": 0.7185446619987488,
+ "learning_rate": 2.7309247941033623e-07,
+ "loss": 0.3878,
+ "step": 1096
+ },
+ {
+ "epoch": 5.173981191222571,
+ "grad_norm": 0.5513623952865601,
+ "learning_rate": 2.700549338944014e-07,
+ "loss": 0.3639,
+ "step": 1097
+ },
+ {
+ "epoch": 5.178683385579937,
+ "grad_norm": 0.5492207407951355,
+ "learning_rate": 2.6703341117900905e-07,
+ "loss": 0.3822,
+ "step": 1098
+ },
+ {
+ "epoch": 5.183385579937304,
+ "grad_norm": 0.560459554195404,
+ "learning_rate": 2.6402793297467476e-07,
+ "loss": 0.3906,
+ "step": 1099
+ },
+ {
+ "epoch": 5.1880877742946705,
+ "grad_norm": 0.6651716232299805,
+ "learning_rate": 2.6103852087662753e-07,
+ "loss": 0.3616,
+ "step": 1100
+ },
+ {
+ "epoch": 5.192789968652038,
+ "grad_norm": 0.5527139902114868,
+ "learning_rate": 2.580651963646602e-07,
+ "loss": 0.3774,
+ "step": 1101
+ },
+ {
+ "epoch": 5.197492163009405,
+ "grad_norm": 0.6211456656455994,
+ "learning_rate": 2.5510798080296827e-07,
+ "loss": 0.3761,
+ "step": 1102
+ },
+ {
+ "epoch": 5.202194357366771,
+ "grad_norm": 0.5848529934883118,
+ "learning_rate": 2.5216689544000193e-07,
+ "loss": 0.3836,
+ "step": 1103
+ },
+ {
+ "epoch": 5.206896551724138,
+ "grad_norm": 0.5585032105445862,
+ "learning_rate": 2.4924196140831027e-07,
+ "loss": 0.4037,
+ "step": 1104
+ },
+ {
+ "epoch": 5.2115987460815045,
+ "grad_norm": 0.5405401587486267,
+ "learning_rate": 2.4633319972439064e-07,
+ "loss": 0.3546,
+ "step": 1105
+ },
+ {
+ "epoch": 5.216300940438871,
+ "grad_norm": 0.585942268371582,
+ "learning_rate": 2.434406312885376e-07,
+ "loss": 0.3555,
+ "step": 1106
+ },
+ {
+ "epoch": 5.221003134796238,
+ "grad_norm": 0.522100567817688,
+ "learning_rate": 2.405642768846925e-07,
+ "loss": 0.3893,
+ "step": 1107
+ },
+ {
+ "epoch": 5.225705329153605,
+ "grad_norm": 0.5989215970039368,
+ "learning_rate": 2.3770415718029349e-07,
+ "loss": 0.3627,
+ "step": 1108
+ },
+ {
+ "epoch": 5.230407523510972,
+ "grad_norm": 0.5293803215026855,
+ "learning_rate": 2.3486029272612842e-07,
+ "loss": 0.3911,
+ "step": 1109
+ },
+ {
+ "epoch": 5.235109717868339,
+ "grad_norm": 0.5886375308036804,
+ "learning_rate": 2.320327039561865e-07,
+ "loss": 0.3702,
+ "step": 1110
+ },
+ {
+ "epoch": 5.239811912225705,
+ "grad_norm": 0.6012598872184753,
+ "learning_rate": 2.29221411187511e-07,
+ "loss": 0.3991,
+ "step": 1111
+ },
+ {
+ "epoch": 5.244514106583072,
+ "grad_norm": 0.5959130525588989,
+ "learning_rate": 2.2642643462005454e-07,
+ "loss": 0.4047,
+ "step": 1112
+ },
+ {
+ "epoch": 5.2492163009404385,
+ "grad_norm": 0.5763290524482727,
+ "learning_rate": 2.236477943365309e-07,
+ "loss": 0.3799,
+ "step": 1113
+ },
+ {
+ "epoch": 5.253918495297806,
+ "grad_norm": 0.5340325832366943,
+ "learning_rate": 2.2088551030227668e-07,
+ "loss": 0.3864,
+ "step": 1114
+ },
+ {
+ "epoch": 5.258620689655173,
+ "grad_norm": 0.817264974117279,
+ "learning_rate": 2.181396023651003e-07,
+ "loss": 0.3925,
+ "step": 1115
+ },
+ {
+ "epoch": 5.263322884012539,
+ "grad_norm": 0.5827222466468811,
+ "learning_rate": 2.1541009025514536e-07,
+ "loss": 0.3642,
+ "step": 1116
+ },
+ {
+ "epoch": 5.268025078369906,
+ "grad_norm": 0.5535173416137695,
+ "learning_rate": 2.1269699358474617e-07,
+ "loss": 0.3774,
+ "step": 1117
+ },
+ {
+ "epoch": 5.2727272727272725,
+ "grad_norm": 0.5569525361061096,
+ "learning_rate": 2.100003318482871e-07,
+ "loss": 0.388,
+ "step": 1118
+ },
+ {
+ "epoch": 5.277429467084639,
+ "grad_norm": 0.7138118743896484,
+ "learning_rate": 2.073201244220635e-07,
+ "loss": 0.3932,
+ "step": 1119
+ },
+ {
+ "epoch": 5.282131661442007,
+ "grad_norm": 0.5733665227890015,
+ "learning_rate": 2.0465639056414106e-07,
+ "loss": 0.3672,
+ "step": 1120
+ },
+ {
+ "epoch": 5.286833855799373,
+ "grad_norm": 1.4797688722610474,
+ "learning_rate": 2.0200914941421817e-07,
+ "loss": 0.3591,
+ "step": 1121
+ },
+ {
+ "epoch": 5.29153605015674,
+ "grad_norm": 0.5619657635688782,
+ "learning_rate": 1.9937841999348866e-07,
+ "loss": 0.3867,
+ "step": 1122
+ },
+ {
+ "epoch": 5.2962382445141065,
+ "grad_norm": 0.6267490386962891,
+ "learning_rate": 1.9676422120450455e-07,
+ "loss": 0.3697,
+ "step": 1123
+ },
+ {
+ "epoch": 5.300940438871473,
+ "grad_norm": 0.5587145090103149,
+ "learning_rate": 1.9416657183104038e-07,
+ "loss": 0.3906,
+ "step": 1124
+ },
+ {
+ "epoch": 5.30564263322884,
+ "grad_norm": 0.6546695828437805,
+ "learning_rate": 1.915854905379594e-07,
+ "loss": 0.4236,
+ "step": 1125
+ },
+ {
+ "epoch": 5.310344827586207,
+ "grad_norm": 0.517690122127533,
+ "learning_rate": 1.8902099587107592e-07,
+ "loss": 0.3941,
+ "step": 1126
+ },
+ {
+ "epoch": 5.315047021943574,
+ "grad_norm": 0.589698314666748,
+ "learning_rate": 1.8647310625702796e-07,
+ "loss": 0.3799,
+ "step": 1127
+ },
+ {
+ "epoch": 5.3197492163009406,
+ "grad_norm": 0.591764509677887,
+ "learning_rate": 1.8394184000313815e-07,
+ "loss": 0.3836,
+ "step": 1128
+ },
+ {
+ "epoch": 5.324451410658307,
+ "grad_norm": 0.5994925498962402,
+ "learning_rate": 1.814272152972879e-07,
+ "loss": 0.3801,
+ "step": 1129
+ },
+ {
+ "epoch": 5.329153605015674,
+ "grad_norm": 0.5589796304702759,
+ "learning_rate": 1.78929250207783e-07,
+ "loss": 0.3535,
+ "step": 1130
+ },
+ {
+ "epoch": 5.33385579937304,
+ "grad_norm": 0.567366361618042,
+ "learning_rate": 1.7644796268322523e-07,
+ "loss": 0.4041,
+ "step": 1131
+ },
+ {
+ "epoch": 5.338557993730408,
+ "grad_norm": 0.508116602897644,
+ "learning_rate": 1.7398337055238385e-07,
+ "loss": 0.3799,
+ "step": 1132
+ },
+ {
+ "epoch": 5.343260188087775,
+ "grad_norm": 0.5289191007614136,
+ "learning_rate": 1.7153549152406608e-07,
+ "loss": 0.3633,
+ "step": 1133
+ },
+ {
+ "epoch": 5.347962382445141,
+ "grad_norm": 0.5444520115852356,
+ "learning_rate": 1.6910434318699153e-07,
+ "loss": 0.3718,
+ "step": 1134
+ },
+ {
+ "epoch": 5.352664576802508,
+ "grad_norm": 0.5548412799835205,
+ "learning_rate": 1.6668994300966385e-07,
+ "loss": 0.3903,
+ "step": 1135
+ },
+ {
+ "epoch": 5.3573667711598745,
+ "grad_norm": 0.6446338891983032,
+ "learning_rate": 1.642923083402473e-07,
+ "loss": 0.3797,
+ "step": 1136
+ },
+ {
+ "epoch": 5.362068965517241,
+ "grad_norm": 0.5460954904556274,
+ "learning_rate": 1.6191145640644057e-07,
+ "loss": 0.346,
+ "step": 1137
+ },
+ {
+ "epoch": 5.366771159874608,
+ "grad_norm": 0.5421894192695618,
+ "learning_rate": 1.5954740431535442e-07,
+ "loss": 0.3714,
+ "step": 1138
+ },
+ {
+ "epoch": 5.371473354231975,
+ "grad_norm": 0.5762758851051331,
+ "learning_rate": 1.5720016905338558e-07,
+ "loss": 0.395,
+ "step": 1139
+ },
+ {
+ "epoch": 5.376175548589342,
+ "grad_norm": 0.537996232509613,
+ "learning_rate": 1.548697674861005e-07,
+ "loss": 0.3712,
+ "step": 1140
+ },
+ {
+ "epoch": 5.3808777429467085,
+ "grad_norm": 0.6042187809944153,
+ "learning_rate": 1.5255621635810737e-07,
+ "loss": 0.3979,
+ "step": 1141
+ },
+ {
+ "epoch": 5.385579937304075,
+ "grad_norm": 0.5573827028274536,
+ "learning_rate": 1.5025953229294094e-07,
+ "loss": 0.3671,
+ "step": 1142
+ },
+ {
+ "epoch": 5.390282131661442,
+ "grad_norm": 0.5528206825256348,
+ "learning_rate": 1.4797973179294072e-07,
+ "loss": 0.3635,
+ "step": 1143
+ },
+ {
+ "epoch": 5.394984326018808,
+ "grad_norm": 0.5804945826530457,
+ "learning_rate": 1.45716831239133e-07,
+ "loss": 0.3921,
+ "step": 1144
+ },
+ {
+ "epoch": 5.399686520376176,
+ "grad_norm": 0.585839033126831,
+ "learning_rate": 1.4347084689111307e-07,
+ "loss": 0.3827,
+ "step": 1145
+ },
+ {
+ "epoch": 5.4043887147335425,
+ "grad_norm": 0.5442454814910889,
+ "learning_rate": 1.4124179488692823e-07,
+ "loss": 0.371,
+ "step": 1146
+ },
+ {
+ "epoch": 5.409090909090909,
+ "grad_norm": 0.6282792687416077,
+ "learning_rate": 1.3902969124296228e-07,
+ "loss": 0.3871,
+ "step": 1147
+ },
+ {
+ "epoch": 5.413793103448276,
+ "grad_norm": 0.5381441116333008,
+ "learning_rate": 1.3683455185382e-07,
+ "loss": 0.3812,
+ "step": 1148
+ },
+ {
+ "epoch": 5.418495297805642,
+ "grad_norm": 0.5682828426361084,
+ "learning_rate": 1.3465639249221313e-07,
+ "loss": 0.4092,
+ "step": 1149
+ },
+ {
+ "epoch": 5.423197492163009,
+ "grad_norm": 0.5846763253211975,
+ "learning_rate": 1.324952288088466e-07,
+ "loss": 0.3709,
+ "step": 1150
+ },
+ {
+ "epoch": 5.427899686520377,
+ "grad_norm": 0.5910829901695251,
+ "learning_rate": 1.3035107633230737e-07,
+ "loss": 0.3526,
+ "step": 1151
+ },
+ {
+ "epoch": 5.432601880877743,
+ "grad_norm": 0.530267059803009,
+ "learning_rate": 1.2822395046895032e-07,
+ "loss": 0.3585,
+ "step": 1152
+ },
+ {
+ "epoch": 5.43730407523511,
+ "grad_norm": 0.6139953136444092,
+ "learning_rate": 1.2611386650279167e-07,
+ "loss": 0.3681,
+ "step": 1153
+ },
+ {
+ "epoch": 5.4420062695924765,
+ "grad_norm": 0.5614138245582581,
+ "learning_rate": 1.240208395953943e-07,
+ "loss": 0.3678,
+ "step": 1154
+ },
+ {
+ "epoch": 5.446708463949843,
+ "grad_norm": 0.5488073825836182,
+ "learning_rate": 1.2194488478576266e-07,
+ "loss": 0.3942,
+ "step": 1155
+ },
+ {
+ "epoch": 5.45141065830721,
+ "grad_norm": 0.5502509474754333,
+ "learning_rate": 1.1988601699023244e-07,
+ "loss": 0.3835,
+ "step": 1156
+ },
+ {
+ "epoch": 5.456112852664576,
+ "grad_norm": 0.6090055704116821,
+ "learning_rate": 1.1784425100236419e-07,
+ "loss": 0.384,
+ "step": 1157
+ },
+ {
+ "epoch": 5.460815047021944,
+ "grad_norm": 0.5199264883995056,
+ "learning_rate": 1.1581960149283839e-07,
+ "loss": 0.3647,
+ "step": 1158
+ },
+ {
+ "epoch": 5.4655172413793105,
+ "grad_norm": 1.6897666454315186,
+ "learning_rate": 1.138120830093467e-07,
+ "loss": 0.393,
+ "step": 1159
+ },
+ {
+ "epoch": 5.470219435736677,
+ "grad_norm": 0.630703330039978,
+ "learning_rate": 1.1182170997649067e-07,
+ "loss": 0.3748,
+ "step": 1160
+ },
+ {
+ "epoch": 5.474921630094044,
+ "grad_norm": 0.586810290813446,
+ "learning_rate": 1.0984849669567616e-07,
+ "loss": 0.4071,
+ "step": 1161
+ },
+ {
+ "epoch": 5.47962382445141,
+ "grad_norm": 0.5080476999282837,
+ "learning_rate": 1.0789245734501186e-07,
+ "loss": 0.3959,
+ "step": 1162
+ },
+ {
+ "epoch": 5.484326018808777,
+ "grad_norm": 0.55291748046875,
+ "learning_rate": 1.0595360597920629e-07,
+ "loss": 0.3718,
+ "step": 1163
+ },
+ {
+ "epoch": 5.4890282131661445,
+ "grad_norm": 0.531088650226593,
+ "learning_rate": 1.0403195652946784e-07,
+ "loss": 0.3732,
+ "step": 1164
+ },
+ {
+ "epoch": 5.493730407523511,
+ "grad_norm": 0.5549778938293457,
+ "learning_rate": 1.0212752280340327e-07,
+ "loss": 0.3771,
+ "step": 1165
+ },
+ {
+ "epoch": 5.498432601880878,
+ "grad_norm": 0.5568163394927979,
+ "learning_rate": 1.0024031848492044e-07,
+ "loss": 0.3869,
+ "step": 1166
+ },
+ {
+ "epoch": 5.503134796238244,
+ "grad_norm": 0.5422462224960327,
+ "learning_rate": 9.837035713412823e-08,
+ "loss": 0.3882,
+ "step": 1167
+ },
+ {
+ "epoch": 5.507836990595611,
+ "grad_norm": 0.6083087921142578,
+ "learning_rate": 9.651765218724018e-08,
+ "loss": 0.3929,
+ "step": 1168
+ },
+ {
+ "epoch": 5.512539184952978,
+ "grad_norm": 0.5807571411132812,
+ "learning_rate": 9.468221695647789e-08,
+ "loss": 0.3656,
+ "step": 1169
+ },
+ {
+ "epoch": 5.517241379310345,
+ "grad_norm": 0.5918298959732056,
+ "learning_rate": 9.286406462997305e-08,
+ "loss": 0.3918,
+ "step": 1170
+ },
+ {
+ "epoch": 5.521943573667712,
+ "grad_norm": 0.6814099550247192,
+ "learning_rate": 9.106320827167809e-08,
+ "loss": 0.3823,
+ "step": 1171
+ },
+ {
+ "epoch": 5.5266457680250785,
+ "grad_norm": 0.5486088395118713,
+ "learning_rate": 8.927966082126566e-08,
+ "loss": 0.3904,
+ "step": 1172
+ },
+ {
+ "epoch": 5.531347962382445,
+ "grad_norm": 0.56932133436203,
+ "learning_rate": 8.75134350940407e-08,
+ "loss": 0.3649,
+ "step": 1173
+ },
+ {
+ "epoch": 5.536050156739812,
+ "grad_norm": 0.5368719696998596,
+ "learning_rate": 8.57645437808463e-08,
+ "loss": 0.387,
+ "step": 1174
+ },
+ {
+ "epoch": 5.540752351097178,
+ "grad_norm": 0.5675792694091797,
+ "learning_rate": 8.403299944797244e-08,
+ "loss": 0.3692,
+ "step": 1175
+ },
+ {
+ "epoch": 5.545454545454545,
+ "grad_norm": 0.5697962045669556,
+ "learning_rate": 8.231881453706625e-08,
+ "loss": 0.3667,
+ "step": 1176
+ },
+ {
+ "epoch": 5.5501567398119125,
+ "grad_norm": 0.6095914840698242,
+ "learning_rate": 8.062200136504217e-08,
+ "loss": 0.3774,
+ "step": 1177
+ },
+ {
+ "epoch": 5.554858934169279,
+ "grad_norm": 0.6129378080368042,
+ "learning_rate": 7.894257212399393e-08,
+ "loss": 0.3831,
+ "step": 1178
+ },
+ {
+ "epoch": 5.559561128526646,
+ "grad_norm": 0.5777211785316467,
+ "learning_rate": 7.728053888110681e-08,
+ "loss": 0.3875,
+ "step": 1179
+ },
+ {
+ "epoch": 5.564263322884012,
+ "grad_norm": 0.5230301022529602,
+ "learning_rate": 7.563591357857003e-08,
+ "loss": 0.3735,
+ "step": 1180
+ },
+ {
+ "epoch": 5.568965517241379,
+ "grad_norm": 0.9667361378669739,
+ "learning_rate": 7.40087080334928e-08,
+ "loss": 0.3597,
+ "step": 1181
+ },
+ {
+ "epoch": 5.5736677115987465,
+ "grad_norm": 0.5474180579185486,
+ "learning_rate": 7.239893393781783e-08,
+ "loss": 0.3991,
+ "step": 1182
+ },
+ {
+ "epoch": 5.578369905956113,
+ "grad_norm": 0.5471038818359375,
+ "learning_rate": 7.080660285823687e-08,
+ "loss": 0.3804,
+ "step": 1183
+ },
+ {
+ "epoch": 5.58307210031348,
+ "grad_norm": 0.5728533864021301,
+ "learning_rate": 6.923172623611057e-08,
+ "loss": 0.368,
+ "step": 1184
+ },
+ {
+ "epoch": 5.587774294670846,
+ "grad_norm": 5.423932075500488,
+ "learning_rate": 6.767431538738268e-08,
+ "loss": 0.3828,
+ "step": 1185
+ },
+ {
+ "epoch": 5.592476489028213,
+ "grad_norm": 0.5626399517059326,
+ "learning_rate": 6.613438150250062e-08,
+ "loss": 0.3563,
+ "step": 1186
+ },
+ {
+ "epoch": 5.59717868338558,
+ "grad_norm": 0.5930495262145996,
+ "learning_rate": 6.461193564633538e-08,
+ "loss": 0.3857,
+ "step": 1187
+ },
+ {
+ "epoch": 5.601880877742946,
+ "grad_norm": 0.559417724609375,
+ "learning_rate": 6.310698875810068e-08,
+ "loss": 0.3658,
+ "step": 1188
+ },
+ {
+ "epoch": 5.606583072100314,
+ "grad_norm": 0.5408678650856018,
+ "learning_rate": 6.16195516512752e-08,
+ "loss": 0.3939,
+ "step": 1189
+ },
+ {
+ "epoch": 5.61128526645768,
+ "grad_norm": 0.5929714441299438,
+ "learning_rate": 6.014963501352556e-08,
+ "loss": 0.3837,
+ "step": 1190
+ },
+ {
+ "epoch": 5.615987460815047,
+ "grad_norm": 0.5881562829017639,
+ "learning_rate": 5.8697249406627354e-08,
+ "loss": 0.3646,
+ "step": 1191
+ },
+ {
+ "epoch": 5.620689655172414,
+ "grad_norm": 0.5638775825500488,
+ "learning_rate": 5.726240526639199e-08,
+ "loss": 0.3858,
+ "step": 1192
+ },
+ {
+ "epoch": 5.62539184952978,
+ "grad_norm": 0.559077799320221,
+ "learning_rate": 5.5845112902589703e-08,
+ "loss": 0.3829,
+ "step": 1193
+ },
+ {
+ "epoch": 5.630094043887147,
+ "grad_norm": 0.5313701629638672,
+ "learning_rate": 5.44453824988761e-08,
+ "loss": 0.359,
+ "step": 1194
+ },
+ {
+ "epoch": 5.6347962382445145,
+ "grad_norm": 0.5203026533126831,
+ "learning_rate": 5.3063224112719355e-08,
+ "loss": 0.3877,
+ "step": 1195
+ },
+ {
+ "epoch": 5.639498432601881,
+ "grad_norm": 0.6288402080535889,
+ "learning_rate": 5.169864767532673e-08,
+ "loss": 0.3598,
+ "step": 1196
+ },
+ {
+ "epoch": 5.644200626959248,
+ "grad_norm": 0.5521509647369385,
+ "learning_rate": 5.0351662991575677e-08,
+ "loss": 0.3963,
+ "step": 1197
+ },
+ {
+ "epoch": 5.648902821316614,
+ "grad_norm": 0.573415994644165,
+ "learning_rate": 4.9022279739940335e-08,
+ "loss": 0.3826,
+ "step": 1198
+ },
+ {
+ "epoch": 5.653605015673981,
+ "grad_norm": 0.5620110034942627,
+ "learning_rate": 4.7710507472424336e-08,
+ "loss": 0.3684,
+ "step": 1199
+ },
+ {
+ "epoch": 5.658307210031348,
+ "grad_norm": 0.5523247122764587,
+ "learning_rate": 4.641635561449087e-08,
+ "loss": 0.3691,
+ "step": 1200
+ },
+ {
+ "epoch": 5.663009404388715,
+ "grad_norm": 0.548983633518219,
+ "learning_rate": 4.513983346499523e-08,
+ "loss": 0.387,
+ "step": 1201
+ },
+ {
+ "epoch": 5.667711598746082,
+ "grad_norm": 0.7048285603523254,
+ "learning_rate": 4.3880950196118764e-08,
+ "loss": 0.3437,
+ "step": 1202
+ },
+ {
+ "epoch": 5.672413793103448,
+ "grad_norm": 0.5626631379127502,
+ "learning_rate": 4.263971485330198e-08,
+ "loss": 0.3769,
+ "step": 1203
+ },
+ {
+ "epoch": 5.677115987460815,
+ "grad_norm": 0.6815487146377563,
+ "learning_rate": 4.141613635517988e-08,
+ "loss": 0.3747,
+ "step": 1204
+ },
+ {
+ "epoch": 5.681818181818182,
+ "grad_norm": 0.5689387321472168,
+ "learning_rate": 4.021022349351838e-08,
+ "loss": 0.3745,
+ "step": 1205
+ },
+ {
+ "epoch": 5.686520376175548,
+ "grad_norm": 0.554826557636261,
+ "learning_rate": 3.902198493314968e-08,
+ "loss": 0.3704,
+ "step": 1206
+ },
+ {
+ "epoch": 5.691222570532915,
+ "grad_norm": 0.5713619589805603,
+ "learning_rate": 3.785142921191198e-08,
+ "loss": 0.4001,
+ "step": 1207
+ },
+ {
+ "epoch": 5.695924764890282,
+ "grad_norm": 0.5539141893386841,
+ "learning_rate": 3.669856474058708e-08,
+ "loss": 0.3531,
+ "step": 1208
+ },
+ {
+ "epoch": 5.700626959247649,
+ "grad_norm": 0.5899977087974548,
+ "learning_rate": 3.556339980283929e-08,
+ "loss": 0.3827,
+ "step": 1209
+ },
+ {
+ "epoch": 5.705329153605016,
+ "grad_norm": 0.5066832304000854,
+ "learning_rate": 3.4445942555157706e-08,
+ "loss": 0.3819,
+ "step": 1210
+ },
+ {
+ "epoch": 5.710031347962382,
+ "grad_norm": 0.5226137638092041,
+ "learning_rate": 3.3346201026795696e-08,
+ "loss": 0.4003,
+ "step": 1211
+ },
+ {
+ "epoch": 5.714733542319749,
+ "grad_norm": 0.5442674160003662,
+ "learning_rate": 3.2264183119714296e-08,
+ "loss": 0.3733,
+ "step": 1212
+ },
+ {
+ "epoch": 5.7194357366771165,
+ "grad_norm": 0.606975793838501,
+ "learning_rate": 3.1199896608525014e-08,
+ "loss": 0.3858,
+ "step": 1213
+ },
+ {
+ "epoch": 5.724137931034483,
+ "grad_norm": 0.5688278675079346,
+ "learning_rate": 3.0153349140435165e-08,
+ "loss": 0.3593,
+ "step": 1214
+ },
+ {
+ "epoch": 5.72884012539185,
+ "grad_norm": 0.5659214854240417,
+ "learning_rate": 2.9124548235190397e-08,
+ "loss": 0.3511,
+ "step": 1215
+ },
+ {
+ "epoch": 5.733542319749216,
+ "grad_norm": 0.5746808648109436,
+ "learning_rate": 2.811350128502338e-08,
+ "loss": 0.3471,
+ "step": 1216
+ },
+ {
+ "epoch": 5.738244514106583,
+ "grad_norm": 0.552604079246521,
+ "learning_rate": 2.7120215554598538e-08,
+ "loss": 0.3779,
+ "step": 1217
+ },
+ {
+ "epoch": 5.74294670846395,
+ "grad_norm": 0.5258578658103943,
+ "learning_rate": 2.6144698180961548e-08,
+ "loss": 0.3763,
+ "step": 1218
+ },
+ {
+ "epoch": 5.747648902821316,
+ "grad_norm": 0.7367326021194458,
+ "learning_rate": 2.5186956173487152e-08,
+ "loss": 0.3738,
+ "step": 1219
+ },
+ {
+ "epoch": 5.752351097178684,
+ "grad_norm": 0.5616478323936462,
+ "learning_rate": 2.424699641382866e-08,
+ "loss": 0.3671,
+ "step": 1220
+ },
+ {
+ "epoch": 5.75705329153605,
+ "grad_norm": 0.5560866594314575,
+ "learning_rate": 2.33248256558688e-08,
+ "loss": 0.39,
+ "step": 1221
+ },
+ {
+ "epoch": 5.761755485893417,
+ "grad_norm": 1.5981470346450806,
+ "learning_rate": 2.2420450525671155e-08,
+ "loss": 0.3696,
+ "step": 1222
+ },
+ {
+ "epoch": 5.766457680250784,
+ "grad_norm": 0.6795075535774231,
+ "learning_rate": 2.1533877521433267e-08,
+ "loss": 0.3991,
+ "step": 1223
+ },
+ {
+ "epoch": 5.77115987460815,
+ "grad_norm": 0.5623820424079895,
+ "learning_rate": 2.066511301343832e-08,
+ "loss": 0.359,
+ "step": 1224
+ },
+ {
+ "epoch": 5.775862068965517,
+ "grad_norm": 0.5484923720359802,
+ "learning_rate": 1.9814163244010754e-08,
+ "loss": 0.3781,
+ "step": 1225
+ },
+ {
+ "epoch": 5.7805642633228835,
+ "grad_norm": 0.6006155014038086,
+ "learning_rate": 1.8981034327470727e-08,
+ "loss": 0.3867,
+ "step": 1226
+ },
+ {
+ "epoch": 5.785266457680251,
+ "grad_norm": 0.5540259480476379,
+ "learning_rate": 1.8165732250090828e-08,
+ "loss": 0.3918,
+ "step": 1227
+ },
+ {
+ "epoch": 5.789968652037618,
+ "grad_norm": 0.5383128523826599,
+ "learning_rate": 1.736826287005222e-08,
+ "loss": 0.3898,
+ "step": 1228
+ },
+ {
+ "epoch": 5.794670846394984,
+ "grad_norm": 1.3232446908950806,
+ "learning_rate": 1.6588631917403285e-08,
+ "loss": 0.3996,
+ "step": 1229
+ },
+ {
+ "epoch": 5.799373040752351,
+ "grad_norm": 0.5614281892776489,
+ "learning_rate": 1.5826844994017986e-08,
+ "loss": 0.3776,
+ "step": 1230
+ },
+ {
+ "epoch": 5.804075235109718,
+ "grad_norm": 0.566788375377655,
+ "learning_rate": 1.5082907573555906e-08,
+ "loss": 0.3842,
+ "step": 1231
+ },
+ {
+ "epoch": 5.808777429467085,
+ "grad_norm": 0.5404055714607239,
+ "learning_rate": 1.435682500142227e-08,
+ "loss": 0.3702,
+ "step": 1232
+ },
+ {
+ "epoch": 5.813479623824452,
+ "grad_norm": 0.5612157583236694,
+ "learning_rate": 1.3648602494730768e-08,
+ "loss": 0.3663,
+ "step": 1233
+ },
+ {
+ "epoch": 5.818181818181818,
+ "grad_norm": 0.6095117926597595,
+ "learning_rate": 1.2958245142265235e-08,
+ "loss": 0.4008,
+ "step": 1234
+ },
+ {
+ "epoch": 5.822884012539185,
+ "grad_norm": 0.5785005688667297,
+ "learning_rate": 1.2285757904442475e-08,
+ "loss": 0.3818,
+ "step": 1235
+ },
+ {
+ "epoch": 5.827586206896552,
+ "grad_norm": 0.5348698496818542,
+ "learning_rate": 1.1631145613278105e-08,
+ "loss": 0.3785,
+ "step": 1236
+ },
+ {
+ "epoch": 5.832288401253918,
+ "grad_norm": 0.5775424838066101,
+ "learning_rate": 1.0994412972351043e-08,
+ "loss": 0.39,
+ "step": 1237
+ },
+ {
+ "epoch": 5.836990595611285,
+ "grad_norm": 0.5630746483802795,
+ "learning_rate": 1.0375564556769357e-08,
+ "loss": 0.3973,
+ "step": 1238
+ },
+ {
+ "epoch": 5.841692789968652,
+ "grad_norm": 0.5375484824180603,
+ "learning_rate": 9.774604813138078e-09,
+ "loss": 0.3735,
+ "step": 1239
+ },
+ {
+ "epoch": 5.846394984326019,
+ "grad_norm": 0.6980341076850891,
+ "learning_rate": 9.191538059526717e-09,
+ "loss": 0.3731,
+ "step": 1240
+ },
+ {
+ "epoch": 5.851097178683386,
+ "grad_norm": 0.5714002847671509,
+ "learning_rate": 8.626368485438742e-09,
+ "loss": 0.4036,
+ "step": 1241
+ },
+ {
+ "epoch": 5.855799373040752,
+ "grad_norm": 0.7190118432044983,
+ "learning_rate": 8.07910015178104e-09,
+ "loss": 0.384,
+ "step": 1242
+ },
+ {
+ "epoch": 5.860501567398119,
+ "grad_norm": 0.6347464919090271,
+ "learning_rate": 7.549736990835054e-09,
+ "loss": 0.3788,
+ "step": 1243
+ },
+ {
+ "epoch": 5.8652037617554855,
+ "grad_norm": 0.5509764552116394,
+ "learning_rate": 7.0382828062279254e-09,
+ "loss": 0.383,
+ "step": 1244
+ },
+ {
+ "epoch": 5.869905956112853,
+ "grad_norm": 0.5286639928817749,
+ "learning_rate": 6.544741272906385e-09,
+ "loss": 0.357,
+ "step": 1245
+ },
+ {
+ "epoch": 5.87460815047022,
+ "grad_norm": 0.5207967162132263,
+ "learning_rate": 6.0691159371087386e-09,
+ "loss": 0.3645,
+ "step": 1246
+ },
+ {
+ "epoch": 5.879310344827586,
+ "grad_norm": 0.573596179485321,
+ "learning_rate": 5.611410216340984e-09,
+ "loss": 0.4065,
+ "step": 1247
+ },
+ {
+ "epoch": 5.884012539184953,
+ "grad_norm": 0.53862464427948,
+ "learning_rate": 5.171627399351009e-09,
+ "loss": 0.3817,
+ "step": 1248
+ },
+ {
+ "epoch": 5.88871473354232,
+ "grad_norm": 0.592100977897644,
+ "learning_rate": 4.749770646105822e-09,
+ "loss": 0.3963,
+ "step": 1249
+ },
+ {
+ "epoch": 5.893416927899686,
+ "grad_norm": 0.6402301788330078,
+ "learning_rate": 4.3458429877679675e-09,
+ "loss": 0.3836,
+ "step": 1250
+ },
+ {
+ "epoch": 5.898119122257054,
+ "grad_norm": 0.6455159783363342,
+ "learning_rate": 3.959847326674704e-09,
+ "loss": 0.4036,
+ "step": 1251
+ },
+ {
+ "epoch": 5.90282131661442,
+ "grad_norm": 0.6556720733642578,
+ "learning_rate": 3.591786436316358e-09,
+ "loss": 0.3548,
+ "step": 1252
+ },
+ {
+ "epoch": 5.907523510971787,
+ "grad_norm": 0.5617692470550537,
+ "learning_rate": 3.241662961317171e-09,
+ "loss": 0.3857,
+ "step": 1253
+ },
+ {
+ "epoch": 5.912225705329154,
+ "grad_norm": 0.5845247507095337,
+ "learning_rate": 2.909479417415595e-09,
+ "loss": 0.3861,
+ "step": 1254
+ },
+ {
+ "epoch": 5.91692789968652,
+ "grad_norm": 0.5122352838516235,
+ "learning_rate": 2.5952381914465253e-09,
+ "loss": 0.3832,
+ "step": 1255
+ },
+ {
+ "epoch": 5.921630094043887,
+ "grad_norm": 0.5804286599159241,
+ "learning_rate": 2.298941541323818e-09,
+ "loss": 0.3808,
+ "step": 1256
+ },
+ {
+ "epoch": 5.9263322884012535,
+ "grad_norm": 0.5455173850059509,
+ "learning_rate": 2.020591596024746e-09,
+ "loss": 0.3637,
+ "step": 1257
+ },
+ {
+ "epoch": 5.931034482758621,
+ "grad_norm": 0.5773369669914246,
+ "learning_rate": 1.7601903555744537e-09,
+ "loss": 0.3689,
+ "step": 1258
+ },
+ {
+ "epoch": 5.935736677115988,
+ "grad_norm": 0.6205880045890808,
+ "learning_rate": 1.5177396910312502e-09,
+ "loss": 0.3727,
+ "step": 1259
+ },
+ {
+ "epoch": 5.940438871473354,
+ "grad_norm": 0.5954082608222961,
+ "learning_rate": 1.2932413444727287e-09,
+ "loss": 0.3756,
+ "step": 1260
+ },
+ {
+ "epoch": 5.945141065830721,
+ "grad_norm": 0.5531405210494995,
+ "learning_rate": 1.0866969289849426e-09,
+ "loss": 0.3915,
+ "step": 1261
+ },
+ {
+ "epoch": 5.9498432601880875,
+ "grad_norm": 0.5516759753227234,
+ "learning_rate": 8.98107928649361e-10,
+ "loss": 0.3695,
+ "step": 1262
+ },
+ {
+ "epoch": 5.954545454545455,
+ "grad_norm": 0.6130419373512268,
+ "learning_rate": 7.274756985323205e-10,
+ "loss": 0.4104,
+ "step": 1263
+ },
+ {
+ "epoch": 5.959247648902822,
+ "grad_norm": 0.5638909935951233,
+ "learning_rate": 5.748014646755895e-10,
+ "loss": 0.3901,
+ "step": 1264
+ },
+ {
+ "epoch": 5.963949843260188,
+ "grad_norm": 0.5288422107696533,
+ "learning_rate": 4.4008632408831797e-10,
+ "loss": 0.3969,
+ "step": 1265
+ },
+ {
+ "epoch": 5.968652037617555,
+ "grad_norm": 0.6486741304397583,
+ "learning_rate": 3.2333124473704623e-10,
+ "loss": 0.3706,
+ "step": 1266
+ },
+ {
+ "epoch": 5.9733542319749215,
+ "grad_norm": 0.5630664825439453,
+ "learning_rate": 2.245370655409862e-10,
+ "loss": 0.3809,
+ "step": 1267
+ },
+ {
+ "epoch": 5.978056426332288,
+ "grad_norm": 0.7482160329818726,
+ "learning_rate": 1.4370449636535998e-10,
+ "loss": 0.3816,
+ "step": 1268
+ },
+ {
+ "epoch": 5.982758620689655,
+ "grad_norm": 0.6069183945655823,
+ "learning_rate": 8.083411801529384e-11,
+ "loss": 0.3906,
+ "step": 1269
+ },
+ {
+ "epoch": 5.987460815047022,
+ "grad_norm": 0.5793895721435547,
+ "learning_rate": 3.592638223220979e-11,
+ "loss": 0.3872,
+ "step": 1270
+ },
+ {
+ "epoch": 5.992163009404389,
+ "grad_norm": 0.5463269948959351,
+ "learning_rate": 8.98161169188283e-12,
+ "loss": 0.3743,
+ "step": 1271
+ },
+ {
+ "epoch": 5.996865203761756,
+ "grad_norm": 0.6062924861907959,
+ "learning_rate": 0.0,
+ "loss": 0.3788,
+ "step": 1272
+ }
+ ],
+ "logging_steps": 1,
+ "max_steps": 1272,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 6,
+ "save_steps": 212,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": true
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 8.193552390105648e+19,
+ "train_batch_size": 4,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/checkpoint-1272/training_args.bin b/checkpoint-1272/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7db90ca60ea3c300feb3b7d6e0cb54fc7cfb2060
--- /dev/null
+++ b/checkpoint-1272/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:51f85402b182fc4b86518e0cb9ca9cbf150300e36000a38f53507b9a8663ad4b
+size 7928
diff --git a/checkpoint-1272/zero_to_fp32.py b/checkpoint-1272/zero_to_fp32.py
new file mode 100644
index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8
--- /dev/null
+++ b/checkpoint-1272/zero_to_fp32.py
@@ -0,0 +1,604 @@
+#!/usr/bin/env python
+
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example: python zero_to_fp32.py . pytorch_model.bin
+
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+from collections import OrderedDict
+from dataclasses import dataclass
+
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+ FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+ FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+
+
# Container for the per-rank model state distilled from a *_model_states.pt
# file. One instance is built per rank by parse_model_states().
@dataclass
class zero_model_state:
    # fp32 buffers keyed by name (e.g. batch-norm running stats)
    buffers: dict
    # list of {param_name: shape} dicts, one per optimizer param group
    param_shapes: dict
    # [[alias_name, source_name], ...] pairs for shared parameters
    shared_params: list
    # deepspeed version the checkpoint was written with (None if absent)
    ds_version: int
    # {param_name: shape} for frozen (untrained) params, or None
    frozen_param_shapes: dict
    # {param_name: tensor fragment} for frozen params, or None
    frozen_param_fragments: dict
+
+
+debug = 0
+
+# load to cpu
+device = torch.device('cpu')
+
+
def atoi(text):
    """Convert ``text`` to int when it is all digits; otherwise return it unchanged."""
    if text.isdigit():
        return int(text)
    return text
+
+
def natural_keys(text):
    """Sort key for "human" ordering: digit runs compare numerically.

    Usage: alist.sort(key=natural_keys)
    See http://nedbatchelder.com/blog/200712/human_sorting.html
    (See Toothy's implementation in the comments)
    """
    chunks = re.split(r'(\d+)', text)
    return [atoi(chunk) for chunk in chunks]
+
+
def get_model_state_file(checkpoint_dir, zero_stage):
    """Return the path of the single rank-0 model-states file in ``checkpoint_dir``.

    Args:
        checkpoint_dir: checkpoint folder to look in
        zero_stage: ZeRO stage the checkpoint was written with (1, 2 or 3)

    Raises:
        FileNotFoundError: if the directory or the expected file doesn't exist
        ValueError: if ``zero_stage`` is not a recognized stage (previously this
            fell through and crashed with UnboundLocalError on ``file``)
    """
    if not os.path.isdir(checkpoint_dir):
        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")

    # there should be only one file
    if zero_stage <= 2:
        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
    elif zero_stage == 3:
        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
    else:
        # guard against an unbound local below
        raise ValueError(f"unknown zero stage {zero_stage}")

    if not os.path.exists(file):
        raise FileNotFoundError(f"can't find model states file at '{file}'")

    return file
+
+
def get_checkpoint_files(checkpoint_dir, glob_pattern):
    """Return every file matching ``glob_pattern`` under ``checkpoint_dir``, naturally sorted.

    Raises FileNotFoundError when nothing matches.
    """
    # XXX: need to test that this simple glob rule works for multi-node setup too
    pattern = os.path.join(checkpoint_dir, glob_pattern)
    ckpt_files = sorted(glob.glob(pattern), key=natural_keys)

    if not ckpt_files:
        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")

    return ckpt_files
+
+
def get_optim_files(checkpoint_dir):
    """Return the naturally-sorted ``*_optim_states.pt`` shards in ``checkpoint_dir``."""
    return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
+
+
def get_model_state_files(checkpoint_dir):
    """Return the naturally-sorted ``*_model_states.pt`` shards in ``checkpoint_dir``."""
    return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
+
+
def parse_model_states(files):
    """Load each ``*_model_states.pt`` shard and distill it into a zero_model_state.

    Args:
        files: model-states checkpoint paths, one per rank

    Returns:
        list of zero_model_state, in the same order as ``files``

    Raises:
        ValueError: if a file lacks the buffer-names key, i.e. it is not a
            model state checkpoint
    """
    zero_model_states = []
    for file in files:
        state_dict = torch.load(file, map_location=device)

        if BUFFER_NAMES not in state_dict:
            raise ValueError(f"{file} is not a model state checkpoint")
        buffer_names = state_dict[BUFFER_NAMES]
        if debug:
            print("Found buffers:", buffer_names)

        # recover just the buffers while restoring them to fp32 if they were saved in fp16
        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
        param_shapes = state_dict[PARAM_SHAPES]

        # collect parameters that are included in param_shapes
        # NOTE(review): param_names is built but never used below — kept for
        # parity with the upstream script
        param_names = []
        for s in param_shapes:
            for name in s.keys():
                param_names.append(name)

        # update with frozen parameters
        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
        if frozen_param_shapes is not None:
            if debug:
                print(f"Found frozen_param_shapes: {frozen_param_shapes}")
            param_names += list(frozen_param_shapes.keys())

        # handle shared params
        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]

        # ds_version / frozen fragments may be absent in older checkpoints
        ds_version = state_dict.get(DS_VERSION, None)

        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)

        z_model_state = zero_model_state(buffers=buffers,
                                         param_shapes=param_shapes,
                                         shared_params=shared_params,
                                         ds_version=ds_version,
                                         frozen_param_shapes=frozen_param_shapes,
                                         frozen_param_fragments=frozen_param_fragments)
        zero_model_states.append(z_model_state)

    return zero_model_states
+
+
def parse_optim_states(files, ds_checkpoint_dir):
    """Load every ``*_optim_states.pt`` shard and extract the fp32 flat groups.

    Args:
        files: optimizer-state shard paths, one per rank
        ds_checkpoint_dir: checkpoint folder (used only in error messages)

    Returns:
        ``(zero_stage, world_size, fp32_flat_groups)`` — fp32_flat_groups has
        one entry per rank; for stage 3 each entry is a single flat tensor
        (param groups concatenated)

    Raises:
        ValueError: if the shards are not a zero checkpoint, the shard count
            doesn't match the saved world size, or the stage is unknown
    """

    total_files = len(files)
    state_dicts = []
    for f in files:
        state_dict = torch.load(f, map_location=device)
        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
        # and also handle the case where it was already removed by another helper script
        # NOTE(review): this pops the *nested* "optimizer_state_dict" entry —
        # presumably the momentum/variance tensors — verify against the
        # checkpoint layout of the deepspeed version in use
        state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
        state_dicts.append(state_dict)

    if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]:
        raise ValueError(f"{files[0]} is not a zero checkpoint")
    zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
    world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]

    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
    # parameters can be different from data parallelism for non-expert parameters. So we can just
    # use the max of the partition_count to get the dp world_size.

    if type(world_size) is list:
        world_size = max(world_size)

    if world_size != total_files:
        raise ValueError(
            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
        )

    # the groups are named differently in each stage
    if zero_stage <= 2:
        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
    elif zero_stage == 3:
        fp32_groups_key = FP32_FLAT_GROUPS
    else:
        raise ValueError(f"unknown zero stage {zero_stage}")

    if zero_stage <= 2:
        fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
    elif zero_stage == 3:
        # if there is more than one param group, there will be multiple flattened tensors - one
        # flattened tensor per group - for simplicity merge them into a single tensor
        #
        # XXX: could make the script more memory efficient for when there are multiple groups - it
        # will require matching the sub-lists of param_shapes for each param group flattened tensor

        fp32_flat_groups = [
            torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts))
        ]

    return zero_stage, world_size, fp32_flat_groups
+
+
def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
    """
    Returns fp32 state_dict reconstructed from ds checkpoint

    Args:
        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
        - ``exclude_frozen_parameters``: when True, frozen (untrained) params are left out

    """
    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")

    # the optimizer shards hold the fp32 master weights
    optim_files = get_optim_files(ds_checkpoint_dir)
    zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")

    # the model shards hold buffers, shapes and frozen params
    model_files = get_model_state_files(ds_checkpoint_dir)
    zero_model_states = parse_model_states(model_files)
    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')

    # dispatch on the stage: 1/2 share a layout, 3 partitions differently
    if zero_stage <= 2:
        return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
                                                          exclude_frozen_parameters)
    elif zero_stage == 3:
        return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
                                                          exclude_frozen_parameters)
+
+
def _zero2_merge_frozen_params(state_dict, zero_model_states):
    """Copy frozen (untrained) params into ``state_dict`` for a ZeRO-1/2 checkpoint.

    Frozen params are saved whole on rank 0 (not partitioned), so this is a
    straight copy from rank 0's fragments. No-op when there are none.
    """
    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
        return

    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
    frozen_param_fragments = zero_model_states[0].frozen_param_fragments

    if debug:
        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')

    # accounting printed for the user; not asserted
    wanted_params = len(frozen_param_shapes)
    wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
    avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
    print(f'Frozen params: Have {avail_numel} numels to process.')
    print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')

    total_params = 0
    total_numel = 0
    for name, shape in frozen_param_shapes.items():
        total_params += 1
        unpartitioned_numel = shape.numel()
        total_numel += unpartitioned_numel

        # fragments are already full tensors in zero-1/2
        state_dict[name] = frozen_param_fragments[name]

        if debug:
            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")

    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _has_callable(obj, fn):
+ attr = getattr(obj, fn, None)
+ return callable(attr)
+
+
def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
    """Reassemble trainable params from ZeRO-1/2 fp32 flat partitions into ``state_dict``.

    In zero-1/2 each rank holds a contiguous slice of every param group's flat
    fp32 vector; concatenating the per-rank slices restores each group, and
    params are then carved out sequentially by shape.
    """
    param_shapes = zero_model_states[0].param_shapes

    # Reconstruction protocol:
    #
    # XXX: document this

    if debug:
        for i in range(world_size):
            for j in range(len(fp32_flat_groups[0])):
                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")

    # XXX: memory usage doubles here (zero2)
    num_param_groups = len(fp32_flat_groups[0])
    merged_single_partition_of_fp32_groups = []
    for i in range(num_param_groups):
        # concatenate the i-th group's slice from every rank
        merged_partitions = [sd[i] for sd in fp32_flat_groups]
        full_single_fp32_vector = torch.cat(merged_partitions, 0)
        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
    avail_numel = sum(
        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])

    if debug:
        wanted_params = sum([len(shapes) for shapes in param_shapes])
        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
        # not asserting if there is a mismatch due to possible padding
        print(f"Have {avail_numel} numels to process.")
        print(f"Need {wanted_numel} numels in {wanted_params} params.")

    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    total_numel = 0
    total_params = 0
    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
        offset = 0
        avail_numel = full_single_fp32_vector.numel()
        for name, shape in shapes.items():

            # shape may be a torch.Size (has .numel) or a plain tuple/list
            unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
            total_numel += unpartitioned_numel
            total_params += 1

            if debug:
                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
            # carve this param's elements out of the flat vector
            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
            offset += unpartitioned_numel

        # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
        # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
        # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
        # live optimizer object, so we are checking that the numbers are within the right range
        align_to = 2 * world_size

        def zero2_align(x):
            # round x up to the nearest multiple of align_to
            return align_to * math.ceil(x / align_to)

        if debug:
            print(f"original offset={offset}, avail_numel={avail_numel}")

        offset = zero2_align(offset)
        avail_numel = zero2_align(avail_numel)

        if debug:
            print(f"aligned offset={offset}, avail_numel={avail_numel}")

        # Sanity check
        if offset != avail_numel:
            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")

    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+
+
def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
                                               exclude_frozen_parameters):
    """Assemble the consolidated fp32 state_dict for a ZeRO-1/2 checkpoint."""
    rank0_state = zero_model_states[0]
    state_dict = OrderedDict()

    # buffers first — they are saved whole, no merging needed
    buffers = rank0_state.buffers
    state_dict.update(buffers)
    if debug:
        print(f"added {len(buffers)} buffers")

    # frozen params are optional
    if not exclude_frozen_parameters:
        _zero2_merge_frozen_params(state_dict, zero_model_states)

    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)

    # recover shared parameters by aliasing the already-restored source tensor
    for alias, source in rank0_state.shared_params:
        if source in state_dict:
            state_dict[alias] = state_dict[source]

    return state_dict
+
+
def zero3_partitioned_param_info(unpartitioned_numel, world_size):
    """Return (per-rank partition size, padding elements added so the param splits evenly)."""
    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
    remainder = unpartitioned_numel % world_size
    if remainder == 0:
        padding_numel = 0
    else:
        padding_numel = world_size - remainder
    return partitioned_numel, padding_numel
+
+
def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
    """Reassemble frozen params into ``state_dict`` for a ZeRO-3 checkpoint.

    Unlike zero-1/2, frozen params are partitioned across ranks in zero-3, so
    each param is rebuilt by concatenating every rank's fragment and trimming
    the alignment padding. No-op when there are none.
    """
    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
        return

    if debug:
        for i in range(world_size):
            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')

    # accounting printed for the user; not asserted
    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
    wanted_params = len(frozen_param_shapes)
    wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
    avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
    print(f'Frozen params: Have {avail_numel} numels to process.')
    print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')

    total_params = 0
    total_numel = 0
    for name, shape in zero_model_states[0].frozen_param_shapes.items():
        total_params += 1
        unpartitioned_numel = shape.numel()
        total_numel += unpartitioned_numel

        # concat every rank's fragment, drop trailing padding, restore shape
        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
        state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)

        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)

        if debug:
            print(
                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
            )

    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
    """Reassemble trainable params from ZeRO-3 fp32 flat partitions into ``state_dict``.

    In zero-3 every rank holds an interleaved slice of each param, so each
    param is rebuilt by taking its slice from every rank's flat tensor at the
    same offset, concatenating, and trimming alignment padding.
    """
    param_shapes = zero_model_states[0].param_shapes
    # NOTE(review): avail_numel is recomputed identically below — kept for
    # parity with the upstream script
    avail_numel = fp32_flat_groups[0].numel() * world_size
    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
    # param, re-consolidating each param, while dealing with padding if any

    # merge list of dicts, preserving order
    param_shapes = {k: v for d in param_shapes for k, v in d.items()}

    if debug:
        for i in range(world_size):
            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")

    wanted_params = len(param_shapes)
    wanted_numel = sum(shape.numel() for shape in param_shapes.values())
    # not asserting if there is a mismatch due to possible padding
    avail_numel = fp32_flat_groups[0].numel() * world_size
    print(f"Trainable params: Have {avail_numel} numels to process.")
    print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")

    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    offset = 0
    total_numel = 0
    total_params = 0
    for name, shape in param_shapes.items():

        unpartitioned_numel = shape.numel()
        total_numel += unpartitioned_numel
        total_params += 1

        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)

        if debug:
            print(
                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
            )

        # XXX: memory usage doubles here
        # gather this param's slice from every rank, then trim the padding
        state_dict[name] = torch.cat(
            tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)),
            0).narrow(0, 0, unpartitioned_numel).view(shape)
        offset += partitioned_numel

    # offset advanced by per-rank partition sizes; scale to total elements
    offset *= world_size

    # Sanity check
    if offset != avail_numel:
        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")

    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+
+
def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
                                               exclude_frozen_parameters):
    """Assemble the consolidated fp32 state_dict for a ZeRO-3 checkpoint."""
    rank0_state = zero_model_states[0]
    state_dict = OrderedDict()

    # buffers first — they are saved whole, no merging needed
    buffers = rank0_state.buffers
    state_dict.update(buffers)
    if debug:
        print(f"added {len(buffers)} buffers")

    # frozen params are optional
    if not exclude_frozen_parameters:
        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)

    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)

    # recover shared parameters by aliasing the already-restored source tensor
    for alias, source in rank0_state.shared_params:
        if source in state_dict:
            state_dict[alias] = state_dict[source]

    return state_dict
+
+
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False):
+ """
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+ ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+ via a model hub.
+
+ Args:
+ - ``checkpoint_dir``: path to the desired checkpoint folder
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+ - ``exclude_frozen_parameters``: exclude frozen parameters
+
+ Returns:
+ - pytorch ``state_dict``
+
+ Raises:
+ - ``ValueError``: if ``tag`` is not given and no 'latest' file exists
+ - ``FileNotFoundError``: if the resolved tag directory does not exist
+
+ Note: this approach may not work if your application doesn't have sufficient free CPU memory and
+ you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+ the checkpoint.
+
+ A typical usage might be ::
+
+ from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+ # do the training and checkpoint saving
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+ model = model.cpu() # move to cpu
+ model.load_state_dict(state_dict)
+ # submit to model hub or save the model to share with others
+
+ In this example the ``model`` will no longer be usable in the deepspeed context of the same
+ application. i.e. you will need to re-initialize the deepspeed engine, since
+ ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+ If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+
+ """
+ # Resolve the checkpoint tag from the 'latest' marker file when the caller
+ # did not supply one explicitly.
+ if tag is None:
+ latest_path = os.path.join(checkpoint_dir, 'latest')
+ if os.path.isfile(latest_path):
+ with open(latest_path, 'r') as fd:
+ tag = fd.read().strip()
+ else:
+ raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+
+ # The actual shard files live in the tag subfolder (e.g. global_step14/).
+ ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+
+ if not os.path.isdir(ds_checkpoint_dir):
+ raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+
+ return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+
+
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False):
+ """
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
+ loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+
+ Args:
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+ - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+ - ``exclude_frozen_parameters``: exclude frozen parameters
+
+ Returns:
+ - ``None``: the consolidated dict is written to ``output_file`` as a side effect
+ """
+
+ # Consolidate in memory, then serialize to a single torch-loadable file.
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters)
+ print(f"Saving fp32 state dict to {output_file}")
+ torch.save(state_dict, output_file)
+
+
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+ """
+ 1. Put the provided model to cpu
+ 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+ 3. Load it into the provided model
+
+ Args:
+ - ``model``: the model object to update
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+
+ Returns:
+ - ``model``: the same model instance, moved to cpu and updated with the consolidated fp32 weights
+
+ Make sure you have plenty of CPU memory available before you call this function. If you don't
+ have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+ conveniently placed for you in the checkpoint folder.
+
+ A typical usage might be ::
+
+ from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+ model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+ # submit to model hub or save the model to share with others
+
+ Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+ of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+ ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+ """
+ logger.info(f"Extracting fp32 weights")
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+
+ logger.info(f"Overwriting model with fp32 weights")
+ model = model.cpu()
+ # strict=False tolerates keys present in the model but missing from the
+ # consolidated dict (e.g. when frozen parameters were excluded at save time
+ # — NOTE(review): confirm against how the checkpoint was produced).
+ model.load_state_dict(state_dict, strict=False)
+
+ return model
+
+
+if __name__ == "__main__":
+
+ # CLI entry point: consolidate a ZeRO checkpoint into a single fp32
+ # state_dict file on disk.
+ parser = argparse.ArgumentParser()
+ parser.add_argument("checkpoint_dir",
+ type=str,
+ help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+ parser.add_argument(
+ "output_file",
+ type=str,
+ help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)")
+ parser.add_argument("-t",
+ "--tag",
+ type=str,
+ default=None,
+ help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+ parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+ parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+ args = parser.parse_args()
+
+ # Module-level flag read by the conversion helpers above to gate verbose
+ # diagnostic printing.
+ debug = args.debug
+
+ convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+ args.output_file,
+ tag=args.tag,
+ exclude_frozen_parameters=args.exclude_frozen_parameters)
diff --git a/checkpoint-212/README.md b/checkpoint-212/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1b184114a0c28ed3e4c082c18486736dc818166d
--- /dev/null
+++ b/checkpoint-212/README.md
@@ -0,0 +1,202 @@
+---
+base_model: meta-llama/Llama-3.3-70B-Instruct
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.15.0
\ No newline at end of file
diff --git a/checkpoint-212/adapter_config.json b/checkpoint-212/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..dc930e1be2d901773c96d6e6d186c72676cbf328
--- /dev/null
+++ b/checkpoint-212/adapter_config.json
@@ -0,0 +1,42 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "meta-llama/Llama-3.3-70B-Instruct",
+ "bias": "none",
+ "corda_config": null,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": null,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 512,
+ "lora_bias": false,
+ "lora_dropout": 0.05,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": [
+ "embed_tokens",
+ "lm_head"
+ ],
+ "peft_type": "LORA",
+ "r": 256,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "up_proj",
+ "gate_proj",
+ "o_proj",
+ "v_proj",
+ "q_proj",
+ "k_proj",
+ "down_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-212/adapter_model.safetensors b/checkpoint-212/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..9a809d503771e48a20289225637992f9a49853e6
--- /dev/null
+++ b/checkpoint-212/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9b895101dd02fc580dd030b1bba1b783b3ed757c3d94bfccfde0108bc56dff67
+size 10829849744
diff --git a/checkpoint-212/global_step212/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-212/global_step212/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..50bec7e766e2b508c0a19f9fad5f4c0b15cb16aa
--- /dev/null
+++ b/checkpoint-212/global_step212/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:966c2756d675d72b5e01ef1e586f436a7abd9cf7ecd7aae31d9ca56174d6497a
+size 21659418140
diff --git a/checkpoint-212/global_step212/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-212/global_step212/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..cac30cdf3e9254aca05fb7f18b10d70d71e41990
--- /dev/null
+++ b/checkpoint-212/global_step212/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f35fecf4323f7222fafcab635c0170803fde2dc2a631a42bc123d432c8ddd948
+size 21659457372
diff --git a/checkpoint-212/global_step212/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-212/global_step212/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..5ae1daec373f32eae3b2aef537c3eb63135b7880
--- /dev/null
+++ b/checkpoint-212/global_step212/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:26766b718f57c8b2fbfa44464480db2fb919dbd4127d7f2c2e4599a70757243c
+size 21659417820
diff --git a/checkpoint-212/global_step212/mp_rank_00_model_states.pt b/checkpoint-212/global_step212/mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..b39a3b08a20ccf98427ed946569cdee723312251
--- /dev/null
+++ b/checkpoint-212/global_step212/mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8ed6b29d0f6e1a5b6f8979ef194f9940b65ad4d625a9a173625488407d5e69fd
+size 11918643933
diff --git a/checkpoint-212/latest b/checkpoint-212/latest
new file mode 100644
index 0000000000000000000000000000000000000000..f28add5c66e8bb348efc349c8c3b5d2138e4e4ae
--- /dev/null
+++ b/checkpoint-212/latest
@@ -0,0 +1 @@
+global_step212
\ No newline at end of file
diff --git a/checkpoint-212/rng_state_0.pth b/checkpoint-212/rng_state_0.pth
new file mode 100644
index 0000000000000000000000000000000000000000..8241c78ece0cb26bfcc4a3f36ee34af6d9e1e094
--- /dev/null
+++ b/checkpoint-212/rng_state_0.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8e7d28ef5a46075665c080df045f034256a880011e1c5f34aa66e5dd2441a318
+size 14768
diff --git a/checkpoint-212/rng_state_1.pth b/checkpoint-212/rng_state_1.pth
new file mode 100644
index 0000000000000000000000000000000000000000..262077a6adc15e13047d8879951bee1d4e2c7e69
--- /dev/null
+++ b/checkpoint-212/rng_state_1.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:461dbed5b970a002786cc0d99e37e91428b03b42e8a773f86e13e8ed2d6e54e4
+size 14768
diff --git a/checkpoint-212/rng_state_2.pth b/checkpoint-212/rng_state_2.pth
new file mode 100644
index 0000000000000000000000000000000000000000..9346bfe2ed4ca151bfc8ba3ca3ed053af840d420
--- /dev/null
+++ b/checkpoint-212/rng_state_2.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:632fad3a0567b603cc0ed006fa5027bc7334196edfcd5b71fb181ce9c97f7688
+size 14768
diff --git a/checkpoint-212/scheduler.pt b/checkpoint-212/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..81b90ede1acc93722fa9b4aa3dd7583bf2962e12
--- /dev/null
+++ b/checkpoint-212/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5c9f5da1c8a1bbe0a369581ee4fb05317c6ef71ee5b0a863fdc62468dbff77f7
+size 1064
diff --git a/checkpoint-212/special_tokens_map.json b/checkpoint-212/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..278b7f0f84be865c4687700ee7b3c63d89a51e18
--- /dev/null
+++ b/checkpoint-212/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+ "bos_token": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "<|eot_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/checkpoint-212/tokenizer.json b/checkpoint-212/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2
--- /dev/null
+++ b/checkpoint-212/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
+size 17209920
diff --git a/checkpoint-212/tokenizer_config.json b/checkpoint-212/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ca91a2ef55f4239a7af81d7c9abb05f53621a07b
--- /dev/null
+++ b/checkpoint-212/tokenizer_config.json
@@ -0,0 +1,2064 @@
+{
+ "added_tokens_decoder": {
+ "128000": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128001": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128002": {
+ "content": "<|reserved_special_token_0|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128003": {
+ "content": "<|reserved_special_token_1|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128004": {
+ "content": "<|finetune_right_pad_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128005": {
+ "content": "<|reserved_special_token_2|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128006": {
+ "content": "<|start_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128007": {
+ "content": "<|end_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128008": {
+ "content": "<|eom_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128009": {
+ "content": "<|eot_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128010": {
+ "content": "<|python_tag|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128011": {
+ "content": "<|reserved_special_token_3|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128012": {
+ "content": "<|reserved_special_token_4|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128013": {
+ "content": "<|reserved_special_token_5|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128014": {
+ "content": "<|reserved_special_token_6|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128015": {
+ "content": "<|reserved_special_token_7|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128016": {
+ "content": "<|reserved_special_token_8|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128017": {
+ "content": "<|reserved_special_token_9|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128018": {
+ "content": "<|reserved_special_token_10|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128019": {
+ "content": "<|reserved_special_token_11|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128020": {
+ "content": "<|reserved_special_token_12|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128021": {
+ "content": "<|reserved_special_token_13|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128022": {
+ "content": "<|reserved_special_token_14|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128023": {
+ "content": "<|reserved_special_token_15|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128024": {
+ "content": "<|reserved_special_token_16|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128025": {
+ "content": "<|reserved_special_token_17|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128026": {
+ "content": "<|reserved_special_token_18|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128027": {
+ "content": "<|reserved_special_token_19|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128028": {
+ "content": "<|reserved_special_token_20|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128029": {
+ "content": "<|reserved_special_token_21|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128030": {
+ "content": "<|reserved_special_token_22|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128031": {
+ "content": "<|reserved_special_token_23|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128032": {
+ "content": "<|reserved_special_token_24|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128033": {
+ "content": "<|reserved_special_token_25|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128034": {
+ "content": "<|reserved_special_token_26|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128035": {
+ "content": "<|reserved_special_token_27|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128036": {
+ "content": "<|reserved_special_token_28|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128037": {
+ "content": "<|reserved_special_token_29|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128038": {
+ "content": "<|reserved_special_token_30|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128039": {
+ "content": "<|reserved_special_token_31|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128040": {
+ "content": "<|reserved_special_token_32|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128041": {
+ "content": "<|reserved_special_token_33|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128042": {
+ "content": "<|reserved_special_token_34|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128043": {
+ "content": "<|reserved_special_token_35|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128044": {
+ "content": "<|reserved_special_token_36|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128045": {
+ "content": "<|reserved_special_token_37|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128046": {
+ "content": "<|reserved_special_token_38|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128047": {
+ "content": "<|reserved_special_token_39|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128048": {
+ "content": "<|reserved_special_token_40|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128049": {
+ "content": "<|reserved_special_token_41|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128050": {
+ "content": "<|reserved_special_token_42|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128051": {
+ "content": "<|reserved_special_token_43|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128052": {
+ "content": "<|reserved_special_token_44|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128053": {
+ "content": "<|reserved_special_token_45|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128054": {
+ "content": "<|reserved_special_token_46|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128055": {
+ "content": "<|reserved_special_token_47|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128056": {
+ "content": "<|reserved_special_token_48|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128057": {
+ "content": "<|reserved_special_token_49|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128058": {
+ "content": "<|reserved_special_token_50|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128059": {
+ "content": "<|reserved_special_token_51|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128060": {
+ "content": "<|reserved_special_token_52|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128061": {
+ "content": "<|reserved_special_token_53|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128062": {
+ "content": "<|reserved_special_token_54|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128063": {
+ "content": "<|reserved_special_token_55|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128064": {
+ "content": "<|reserved_special_token_56|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128065": {
+ "content": "<|reserved_special_token_57|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128066": {
+ "content": "<|reserved_special_token_58|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128067": {
+ "content": "<|reserved_special_token_59|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128068": {
+ "content": "<|reserved_special_token_60|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128069": {
+ "content": "<|reserved_special_token_61|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128070": {
+ "content": "<|reserved_special_token_62|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128071": {
+ "content": "<|reserved_special_token_63|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128072": {
+ "content": "<|reserved_special_token_64|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128073": {
+ "content": "<|reserved_special_token_65|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128074": {
+ "content": "<|reserved_special_token_66|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128075": {
+ "content": "<|reserved_special_token_67|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128076": {
+ "content": "<|reserved_special_token_68|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128077": {
+ "content": "<|reserved_special_token_69|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128078": {
+ "content": "<|reserved_special_token_70|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128079": {
+ "content": "<|reserved_special_token_71|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128080": {
+ "content": "<|reserved_special_token_72|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128081": {
+ "content": "<|reserved_special_token_73|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128082": {
+ "content": "<|reserved_special_token_74|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128083": {
+ "content": "<|reserved_special_token_75|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128084": {
+ "content": "<|reserved_special_token_76|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128085": {
+ "content": "<|reserved_special_token_77|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128086": {
+ "content": "<|reserved_special_token_78|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128087": {
+ "content": "<|reserved_special_token_79|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128088": {
+ "content": "<|reserved_special_token_80|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128089": {
+ "content": "<|reserved_special_token_81|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128090": {
+ "content": "<|reserved_special_token_82|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128091": {
+ "content": "<|reserved_special_token_83|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128092": {
+ "content": "<|reserved_special_token_84|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128093": {
+ "content": "<|reserved_special_token_85|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128094": {
+ "content": "<|reserved_special_token_86|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128095": {
+ "content": "<|reserved_special_token_87|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128096": {
+ "content": "<|reserved_special_token_88|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128097": {
+ "content": "<|reserved_special_token_89|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128098": {
+ "content": "<|reserved_special_token_90|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128099": {
+ "content": "<|reserved_special_token_91|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128100": {
+ "content": "<|reserved_special_token_92|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128101": {
+ "content": "<|reserved_special_token_93|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128102": {
+ "content": "<|reserved_special_token_94|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128103": {
+ "content": "<|reserved_special_token_95|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128104": {
+ "content": "<|reserved_special_token_96|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128105": {
+ "content": "<|reserved_special_token_97|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128106": {
+ "content": "<|reserved_special_token_98|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128107": {
+ "content": "<|reserved_special_token_99|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128108": {
+ "content": "<|reserved_special_token_100|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128109": {
+ "content": "<|reserved_special_token_101|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128110": {
+ "content": "<|reserved_special_token_102|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128111": {
+ "content": "<|reserved_special_token_103|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128112": {
+ "content": "<|reserved_special_token_104|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128113": {
+ "content": "<|reserved_special_token_105|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128114": {
+ "content": "<|reserved_special_token_106|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128115": {
+ "content": "<|reserved_special_token_107|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128116": {
+ "content": "<|reserved_special_token_108|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128117": {
+ "content": "<|reserved_special_token_109|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128118": {
+ "content": "<|reserved_special_token_110|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128119": {
+ "content": "<|reserved_special_token_111|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128120": {
+ "content": "<|reserved_special_token_112|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128121": {
+ "content": "<|reserved_special_token_113|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128122": {
+ "content": "<|reserved_special_token_114|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128123": {
+ "content": "<|reserved_special_token_115|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128124": {
+ "content": "<|reserved_special_token_116|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128125": {
+ "content": "<|reserved_special_token_117|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128126": {
+ "content": "<|reserved_special_token_118|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128127": {
+ "content": "<|reserved_special_token_119|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128128": {
+ "content": "<|reserved_special_token_120|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128129": {
+ "content": "<|reserved_special_token_121|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128130": {
+ "content": "<|reserved_special_token_122|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128131": {
+ "content": "<|reserved_special_token_123|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128132": {
+ "content": "<|reserved_special_token_124|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128133": {
+ "content": "<|reserved_special_token_125|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128134": {
+ "content": "<|reserved_special_token_126|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128135": {
+ "content": "<|reserved_special_token_127|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128136": {
+ "content": "<|reserved_special_token_128|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128137": {
+ "content": "<|reserved_special_token_129|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128138": {
+ "content": "<|reserved_special_token_130|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128139": {
+ "content": "<|reserved_special_token_131|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128140": {
+ "content": "<|reserved_special_token_132|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128141": {
+ "content": "<|reserved_special_token_133|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128142": {
+ "content": "<|reserved_special_token_134|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128143": {
+ "content": "<|reserved_special_token_135|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128144": {
+ "content": "<|reserved_special_token_136|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128145": {
+ "content": "<|reserved_special_token_137|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128146": {
+ "content": "<|reserved_special_token_138|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128147": {
+ "content": "<|reserved_special_token_139|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128148": {
+ "content": "<|reserved_special_token_140|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128149": {
+ "content": "<|reserved_special_token_141|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128150": {
+ "content": "<|reserved_special_token_142|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128151": {
+ "content": "<|reserved_special_token_143|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128152": {
+ "content": "<|reserved_special_token_144|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128153": {
+ "content": "<|reserved_special_token_145|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128154": {
+ "content": "<|reserved_special_token_146|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128155": {
+ "content": "<|reserved_special_token_147|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128156": {
+ "content": "<|reserved_special_token_148|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128157": {
+ "content": "<|reserved_special_token_149|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128158": {
+ "content": "<|reserved_special_token_150|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128159": {
+ "content": "<|reserved_special_token_151|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128160": {
+ "content": "<|reserved_special_token_152|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128161": {
+ "content": "<|reserved_special_token_153|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128162": {
+ "content": "<|reserved_special_token_154|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128163": {
+ "content": "<|reserved_special_token_155|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128164": {
+ "content": "<|reserved_special_token_156|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128165": {
+ "content": "<|reserved_special_token_157|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128166": {
+ "content": "<|reserved_special_token_158|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128167": {
+ "content": "<|reserved_special_token_159|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128168": {
+ "content": "<|reserved_special_token_160|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128169": {
+ "content": "<|reserved_special_token_161|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128170": {
+ "content": "<|reserved_special_token_162|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128171": {
+ "content": "<|reserved_special_token_163|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128172": {
+ "content": "<|reserved_special_token_164|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128173": {
+ "content": "<|reserved_special_token_165|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128174": {
+ "content": "<|reserved_special_token_166|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128175": {
+ "content": "<|reserved_special_token_167|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128176": {
+ "content": "<|reserved_special_token_168|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128177": {
+ "content": "<|reserved_special_token_169|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128178": {
+ "content": "<|reserved_special_token_170|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128179": {
+ "content": "<|reserved_special_token_171|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128180": {
+ "content": "<|reserved_special_token_172|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128181": {
+ "content": "<|reserved_special_token_173|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128182": {
+ "content": "<|reserved_special_token_174|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128183": {
+ "content": "<|reserved_special_token_175|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128184": {
+ "content": "<|reserved_special_token_176|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128185": {
+ "content": "<|reserved_special_token_177|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128186": {
+ "content": "<|reserved_special_token_178|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128187": {
+ "content": "<|reserved_special_token_179|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128188": {
+ "content": "<|reserved_special_token_180|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128189": {
+ "content": "<|reserved_special_token_181|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128190": {
+ "content": "<|reserved_special_token_182|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128191": {
+ "content": "<|reserved_special_token_183|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128192": {
+ "content": "<|reserved_special_token_184|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128193": {
+ "content": "<|reserved_special_token_185|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128194": {
+ "content": "<|reserved_special_token_186|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128195": {
+ "content": "<|reserved_special_token_187|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128196": {
+ "content": "<|reserved_special_token_188|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128197": {
+ "content": "<|reserved_special_token_189|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128198": {
+ "content": "<|reserved_special_token_190|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128199": {
+ "content": "<|reserved_special_token_191|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128200": {
+ "content": "<|reserved_special_token_192|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128201": {
+ "content": "<|reserved_special_token_193|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128202": {
+ "content": "<|reserved_special_token_194|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128203": {
+ "content": "<|reserved_special_token_195|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128204": {
+ "content": "<|reserved_special_token_196|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128205": {
+ "content": "<|reserved_special_token_197|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128206": {
+ "content": "<|reserved_special_token_198|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128207": {
+ "content": "<|reserved_special_token_199|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128208": {
+ "content": "<|reserved_special_token_200|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128209": {
+ "content": "<|reserved_special_token_201|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128210": {
+ "content": "<|reserved_special_token_202|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128211": {
+ "content": "<|reserved_special_token_203|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128212": {
+ "content": "<|reserved_special_token_204|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128213": {
+ "content": "<|reserved_special_token_205|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128214": {
+ "content": "<|reserved_special_token_206|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128215": {
+ "content": "<|reserved_special_token_207|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128216": {
+ "content": "<|reserved_special_token_208|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128217": {
+ "content": "<|reserved_special_token_209|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128218": {
+ "content": "<|reserved_special_token_210|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128219": {
+ "content": "<|reserved_special_token_211|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128220": {
+ "content": "<|reserved_special_token_212|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128221": {
+ "content": "<|reserved_special_token_213|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128222": {
+ "content": "<|reserved_special_token_214|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128223": {
+ "content": "<|reserved_special_token_215|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128224": {
+ "content": "<|reserved_special_token_216|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128225": {
+ "content": "<|reserved_special_token_217|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128226": {
+ "content": "<|reserved_special_token_218|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128227": {
+ "content": "<|reserved_special_token_219|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128228": {
+ "content": "<|reserved_special_token_220|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128229": {
+ "content": "<|reserved_special_token_221|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128230": {
+ "content": "<|reserved_special_token_222|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128231": {
+ "content": "<|reserved_special_token_223|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128232": {
+ "content": "<|reserved_special_token_224|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128233": {
+ "content": "<|reserved_special_token_225|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128234": {
+ "content": "<|reserved_special_token_226|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128235": {
+ "content": "<|reserved_special_token_227|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128236": {
+ "content": "<|reserved_special_token_228|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128237": {
+ "content": "<|reserved_special_token_229|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128238": {
+ "content": "<|reserved_special_token_230|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128239": {
+ "content": "<|reserved_special_token_231|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128240": {
+ "content": "<|reserved_special_token_232|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128241": {
+ "content": "<|reserved_special_token_233|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128242": {
+ "content": "<|reserved_special_token_234|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128243": {
+ "content": "<|reserved_special_token_235|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128244": {
+ "content": "<|reserved_special_token_236|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128245": {
+ "content": "<|reserved_special_token_237|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128246": {
+ "content": "<|reserved_special_token_238|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128247": {
+ "content": "<|reserved_special_token_239|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128248": {
+ "content": "<|reserved_special_token_240|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128249": {
+ "content": "<|reserved_special_token_241|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128250": {
+ "content": "<|reserved_special_token_242|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128251": {
+ "content": "<|reserved_special_token_243|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128252": {
+ "content": "<|reserved_special_token_244|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128253": {
+ "content": "<|reserved_special_token_245|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128254": {
+ "content": "<|reserved_special_token_246|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128255": {
+ "content": "<|reserved_special_token_247|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<|begin_of_text|>",
+ "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n",
+ "clean_up_tokenization_spaces": true,
+ "eos_token": "<|eot_id|>",
+ "extra_special_tokens": {},
+ "model_input_names": [
+ "input_ids",
+ "attention_mask"
+ ],
+ "model_max_length": 131072,
+ "pad_token": "<|end_of_text|>",
+ "tokenizer_class": "PreTrainedTokenizer"
+}
diff --git a/checkpoint-212/trainer_state.json b/checkpoint-212/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..8c335bfef7e7ea139953c4e624a5b4497fbf7eed
--- /dev/null
+++ b/checkpoint-212/trainer_state.json
@@ -0,0 +1,1517 @@
+{
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 0.9968652037617555,
+ "eval_steps": 500,
+ "global_step": 212,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.004702194357366771,
+ "grad_norm": 3.1606569290161133,
+ "learning_rate": 5.0000000000000004e-08,
+ "loss": 1.0072,
+ "step": 1
+ },
+ {
+ "epoch": 0.009404388714733543,
+ "grad_norm": 3.2058725357055664,
+ "learning_rate": 1.0000000000000001e-07,
+ "loss": 1.0134,
+ "step": 2
+ },
+ {
+ "epoch": 0.014106583072100314,
+ "grad_norm": 2.636291265487671,
+ "learning_rate": 1.5000000000000002e-07,
+ "loss": 0.9635,
+ "step": 3
+ },
+ {
+ "epoch": 0.018808777429467086,
+ "grad_norm": 2.708746910095215,
+ "learning_rate": 2.0000000000000002e-07,
+ "loss": 1.0068,
+ "step": 4
+ },
+ {
+ "epoch": 0.023510971786833857,
+ "grad_norm": 2.8948426246643066,
+ "learning_rate": 2.5000000000000004e-07,
+ "loss": 0.9608,
+ "step": 5
+ },
+ {
+ "epoch": 0.02821316614420063,
+ "grad_norm": 2.8740086555480957,
+ "learning_rate": 3.0000000000000004e-07,
+ "loss": 0.9896,
+ "step": 6
+ },
+ {
+ "epoch": 0.032915360501567396,
+ "grad_norm": 2.8338170051574707,
+ "learning_rate": 3.5000000000000004e-07,
+ "loss": 0.9098,
+ "step": 7
+ },
+ {
+ "epoch": 0.03761755485893417,
+ "grad_norm": 2.7783002853393555,
+ "learning_rate": 4.0000000000000003e-07,
+ "loss": 0.9733,
+ "step": 8
+ },
+ {
+ "epoch": 0.04231974921630094,
+ "grad_norm": 3.043574333190918,
+ "learning_rate": 4.5000000000000003e-07,
+ "loss": 0.9943,
+ "step": 9
+ },
+ {
+ "epoch": 0.047021943573667714,
+ "grad_norm": 3.142383337020874,
+ "learning_rate": 5.000000000000001e-07,
+ "loss": 0.9475,
+ "step": 10
+ },
+ {
+ "epoch": 0.05172413793103448,
+ "grad_norm": 2.9817280769348145,
+ "learning_rate": 5.5e-07,
+ "loss": 0.9701,
+ "step": 11
+ },
+ {
+ "epoch": 0.05642633228840126,
+ "grad_norm": 2.95699405670166,
+ "learning_rate": 6.000000000000001e-07,
+ "loss": 0.9983,
+ "step": 12
+ },
+ {
+ "epoch": 0.061128526645768025,
+ "grad_norm": 2.8782453536987305,
+ "learning_rate": 6.5e-07,
+ "loss": 0.9502,
+ "step": 13
+ },
+ {
+ "epoch": 0.06583072100313479,
+ "grad_norm": 2.6715071201324463,
+ "learning_rate": 7.000000000000001e-07,
+ "loss": 0.9436,
+ "step": 14
+ },
+ {
+ "epoch": 0.07053291536050156,
+ "grad_norm": 3.869649648666382,
+ "learning_rate": 7.5e-07,
+ "loss": 0.9692,
+ "step": 15
+ },
+ {
+ "epoch": 0.07523510971786834,
+ "grad_norm": 3.060220956802368,
+ "learning_rate": 8.000000000000001e-07,
+ "loss": 0.9258,
+ "step": 16
+ },
+ {
+ "epoch": 0.07993730407523511,
+ "grad_norm": 2.8922741413116455,
+ "learning_rate": 8.500000000000001e-07,
+ "loss": 0.9719,
+ "step": 17
+ },
+ {
+ "epoch": 0.08463949843260188,
+ "grad_norm": 2.7857820987701416,
+ "learning_rate": 9.000000000000001e-07,
+ "loss": 0.9072,
+ "step": 18
+ },
+ {
+ "epoch": 0.08934169278996865,
+ "grad_norm": 2.9753293991088867,
+ "learning_rate": 9.500000000000001e-07,
+ "loss": 0.9032,
+ "step": 19
+ },
+ {
+ "epoch": 0.09404388714733543,
+ "grad_norm": 2.7989683151245117,
+ "learning_rate": 1.0000000000000002e-06,
+ "loss": 0.8887,
+ "step": 20
+ },
+ {
+ "epoch": 0.0987460815047022,
+ "grad_norm": 2.3953049182891846,
+ "learning_rate": 1.0500000000000001e-06,
+ "loss": 0.8968,
+ "step": 21
+ },
+ {
+ "epoch": 0.10344827586206896,
+ "grad_norm": 2.643731117248535,
+ "learning_rate": 1.1e-06,
+ "loss": 0.8501,
+ "step": 22
+ },
+ {
+ "epoch": 0.10815047021943573,
+ "grad_norm": 2.3679006099700928,
+ "learning_rate": 1.1500000000000002e-06,
+ "loss": 0.8476,
+ "step": 23
+ },
+ {
+ "epoch": 0.11285266457680251,
+ "grad_norm": 2.5935540199279785,
+ "learning_rate": 1.2000000000000002e-06,
+ "loss": 0.8095,
+ "step": 24
+ },
+ {
+ "epoch": 0.11755485893416928,
+ "grad_norm": 2.510300636291504,
+ "learning_rate": 1.25e-06,
+ "loss": 0.8099,
+ "step": 25
+ },
+ {
+ "epoch": 0.12225705329153605,
+ "grad_norm": 2.372344970703125,
+ "learning_rate": 1.3e-06,
+ "loss": 0.7869,
+ "step": 26
+ },
+ {
+ "epoch": 0.12695924764890282,
+ "grad_norm": 2.303426504135132,
+ "learning_rate": 1.3500000000000002e-06,
+ "loss": 0.7758,
+ "step": 27
+ },
+ {
+ "epoch": 0.13166144200626959,
+ "grad_norm": 1.9017939567565918,
+ "learning_rate": 1.4000000000000001e-06,
+ "loss": 0.7498,
+ "step": 28
+ },
+ {
+ "epoch": 0.13636363636363635,
+ "grad_norm": 1.8810580968856812,
+ "learning_rate": 1.45e-06,
+ "loss": 0.7878,
+ "step": 29
+ },
+ {
+ "epoch": 0.14106583072100312,
+ "grad_norm": 1.7797424793243408,
+ "learning_rate": 1.5e-06,
+ "loss": 0.7747,
+ "step": 30
+ },
+ {
+ "epoch": 0.14576802507836992,
+ "grad_norm": 1.5053879022598267,
+ "learning_rate": 1.5500000000000002e-06,
+ "loss": 0.7735,
+ "step": 31
+ },
+ {
+ "epoch": 0.15047021943573669,
+ "grad_norm": 1.4909234046936035,
+ "learning_rate": 1.6000000000000001e-06,
+ "loss": 0.7654,
+ "step": 32
+ },
+ {
+ "epoch": 0.15517241379310345,
+ "grad_norm": 1.36083984375,
+ "learning_rate": 1.6500000000000003e-06,
+ "loss": 0.6895,
+ "step": 33
+ },
+ {
+ "epoch": 0.15987460815047022,
+ "grad_norm": 1.536014199256897,
+ "learning_rate": 1.7000000000000002e-06,
+ "loss": 0.675,
+ "step": 34
+ },
+ {
+ "epoch": 0.164576802507837,
+ "grad_norm": 1.3426779508590698,
+ "learning_rate": 1.75e-06,
+ "loss": 0.7652,
+ "step": 35
+ },
+ {
+ "epoch": 0.16927899686520376,
+ "grad_norm": 1.4900612831115723,
+ "learning_rate": 1.8000000000000001e-06,
+ "loss": 0.6863,
+ "step": 36
+ },
+ {
+ "epoch": 0.17398119122257052,
+ "grad_norm": 1.181241750717163,
+ "learning_rate": 1.85e-06,
+ "loss": 0.7136,
+ "step": 37
+ },
+ {
+ "epoch": 0.1786833855799373,
+ "grad_norm": 1.461419701576233,
+ "learning_rate": 1.9000000000000002e-06,
+ "loss": 0.7606,
+ "step": 38
+ },
+ {
+ "epoch": 0.1833855799373041,
+ "grad_norm": 1.04817795753479,
+ "learning_rate": 1.9500000000000004e-06,
+ "loss": 0.6829,
+ "step": 39
+ },
+ {
+ "epoch": 0.18808777429467086,
+ "grad_norm": 1.0499993562698364,
+ "learning_rate": 2.0000000000000003e-06,
+ "loss": 0.7144,
+ "step": 40
+ },
+ {
+ "epoch": 0.19278996865203762,
+ "grad_norm": 0.9935064315795898,
+ "learning_rate": 2.05e-06,
+ "loss": 0.6736,
+ "step": 41
+ },
+ {
+ "epoch": 0.1974921630094044,
+ "grad_norm": 0.9919099807739258,
+ "learning_rate": 2.1000000000000002e-06,
+ "loss": 0.7151,
+ "step": 42
+ },
+ {
+ "epoch": 0.20219435736677116,
+ "grad_norm": 0.919556200504303,
+ "learning_rate": 2.15e-06,
+ "loss": 0.6847,
+ "step": 43
+ },
+ {
+ "epoch": 0.20689655172413793,
+ "grad_norm": 1.4762015342712402,
+ "learning_rate": 2.2e-06,
+ "loss": 0.6694,
+ "step": 44
+ },
+ {
+ "epoch": 0.2115987460815047,
+ "grad_norm": 0.9243163466453552,
+ "learning_rate": 2.25e-06,
+ "loss": 0.6489,
+ "step": 45
+ },
+ {
+ "epoch": 0.21630094043887146,
+ "grad_norm": 0.7614469528198242,
+ "learning_rate": 2.3000000000000004e-06,
+ "loss": 0.6568,
+ "step": 46
+ },
+ {
+ "epoch": 0.22100313479623823,
+ "grad_norm": 0.7543922662734985,
+ "learning_rate": 2.35e-06,
+ "loss": 0.6359,
+ "step": 47
+ },
+ {
+ "epoch": 0.22570532915360503,
+ "grad_norm": 0.7558912038803101,
+ "learning_rate": 2.4000000000000003e-06,
+ "loss": 0.6231,
+ "step": 48
+ },
+ {
+ "epoch": 0.2304075235109718,
+ "grad_norm": 0.7822129130363464,
+ "learning_rate": 2.4500000000000003e-06,
+ "loss": 0.6691,
+ "step": 49
+ },
+ {
+ "epoch": 0.23510971786833856,
+ "grad_norm": 0.8646999597549438,
+ "learning_rate": 2.5e-06,
+ "loss": 0.682,
+ "step": 50
+ },
+ {
+ "epoch": 0.23981191222570533,
+ "grad_norm": 0.8824774622917175,
+ "learning_rate": 2.55e-06,
+ "loss": 0.6805,
+ "step": 51
+ },
+ {
+ "epoch": 0.2445141065830721,
+ "grad_norm": 0.7697399258613586,
+ "learning_rate": 2.6e-06,
+ "loss": 0.6368,
+ "step": 52
+ },
+ {
+ "epoch": 0.24921630094043887,
+ "grad_norm": 0.6522512435913086,
+ "learning_rate": 2.6500000000000005e-06,
+ "loss": 0.6367,
+ "step": 53
+ },
+ {
+ "epoch": 0.25391849529780564,
+ "grad_norm": 0.6172305941581726,
+ "learning_rate": 2.7000000000000004e-06,
+ "loss": 0.6291,
+ "step": 54
+ },
+ {
+ "epoch": 0.25862068965517243,
+ "grad_norm": 0.7860460877418518,
+ "learning_rate": 2.7500000000000004e-06,
+ "loss": 0.6736,
+ "step": 55
+ },
+ {
+ "epoch": 0.26332288401253917,
+ "grad_norm": 0.6474862694740295,
+ "learning_rate": 2.8000000000000003e-06,
+ "loss": 0.6365,
+ "step": 56
+ },
+ {
+ "epoch": 0.26802507836990597,
+ "grad_norm": 0.6867114901542664,
+ "learning_rate": 2.85e-06,
+ "loss": 0.6397,
+ "step": 57
+ },
+ {
+ "epoch": 0.2727272727272727,
+ "grad_norm": 0.7056852579116821,
+ "learning_rate": 2.9e-06,
+ "loss": 0.6138,
+ "step": 58
+ },
+ {
+ "epoch": 0.2774294670846395,
+ "grad_norm": 0.6615664958953857,
+ "learning_rate": 2.95e-06,
+ "loss": 0.6482,
+ "step": 59
+ },
+ {
+ "epoch": 0.28213166144200624,
+ "grad_norm": 0.6649022102355957,
+ "learning_rate": 3e-06,
+ "loss": 0.6745,
+ "step": 60
+ },
+ {
+ "epoch": 0.28683385579937304,
+ "grad_norm": 0.850848913192749,
+ "learning_rate": 3.05e-06,
+ "loss": 0.5956,
+ "step": 61
+ },
+ {
+ "epoch": 0.29153605015673983,
+ "grad_norm": 0.5983562469482422,
+ "learning_rate": 3.1000000000000004e-06,
+ "loss": 0.5894,
+ "step": 62
+ },
+ {
+ "epoch": 0.2962382445141066,
+ "grad_norm": 0.6286782622337341,
+ "learning_rate": 3.1500000000000003e-06,
+ "loss": 0.6329,
+ "step": 63
+ },
+ {
+ "epoch": 0.30094043887147337,
+ "grad_norm": 0.5919945240020752,
+ "learning_rate": 3.2000000000000003e-06,
+ "loss": 0.6402,
+ "step": 64
+ },
+ {
+ "epoch": 0.3056426332288401,
+ "grad_norm": 0.5632765889167786,
+ "learning_rate": 3.2500000000000002e-06,
+ "loss": 0.5862,
+ "step": 65
+ },
+ {
+ "epoch": 0.3103448275862069,
+ "grad_norm": 0.7692590951919556,
+ "learning_rate": 3.3000000000000006e-06,
+ "loss": 0.6031,
+ "step": 66
+ },
+ {
+ "epoch": 0.31504702194357365,
+ "grad_norm": 0.7313893437385559,
+ "learning_rate": 3.3500000000000005e-06,
+ "loss": 0.6312,
+ "step": 67
+ },
+ {
+ "epoch": 0.31974921630094044,
+ "grad_norm": 0.6097120642662048,
+ "learning_rate": 3.4000000000000005e-06,
+ "loss": 0.5986,
+ "step": 68
+ },
+ {
+ "epoch": 0.32445141065830724,
+ "grad_norm": 0.5853808522224426,
+ "learning_rate": 3.45e-06,
+ "loss": 0.5847,
+ "step": 69
+ },
+ {
+ "epoch": 0.329153605015674,
+ "grad_norm": 0.6093555092811584,
+ "learning_rate": 3.5e-06,
+ "loss": 0.6552,
+ "step": 70
+ },
+ {
+ "epoch": 0.3338557993730408,
+ "grad_norm": 0.6106334328651428,
+ "learning_rate": 3.5500000000000003e-06,
+ "loss": 0.6196,
+ "step": 71
+ },
+ {
+ "epoch": 0.3385579937304075,
+ "grad_norm": 0.9254828691482544,
+ "learning_rate": 3.6000000000000003e-06,
+ "loss": 0.6005,
+ "step": 72
+ },
+ {
+ "epoch": 0.3432601880877743,
+ "grad_norm": 0.5471694469451904,
+ "learning_rate": 3.65e-06,
+ "loss": 0.5907,
+ "step": 73
+ },
+ {
+ "epoch": 0.34796238244514105,
+ "grad_norm": 0.6204228401184082,
+ "learning_rate": 3.7e-06,
+ "loss": 0.6079,
+ "step": 74
+ },
+ {
+ "epoch": 0.35266457680250785,
+ "grad_norm": 0.52458256483078,
+ "learning_rate": 3.7500000000000005e-06,
+ "loss": 0.6001,
+ "step": 75
+ },
+ {
+ "epoch": 0.3573667711598746,
+ "grad_norm": 0.5356763601303101,
+ "learning_rate": 3.8000000000000005e-06,
+ "loss": 0.5987,
+ "step": 76
+ },
+ {
+ "epoch": 0.3620689655172414,
+ "grad_norm": 0.5408467054367065,
+ "learning_rate": 3.85e-06,
+ "loss": 0.6104,
+ "step": 77
+ },
+ {
+ "epoch": 0.3667711598746082,
+ "grad_norm": 0.5075871348381042,
+ "learning_rate": 3.900000000000001e-06,
+ "loss": 0.5569,
+ "step": 78
+ },
+ {
+ "epoch": 0.3714733542319749,
+ "grad_norm": 0.8474109768867493,
+ "learning_rate": 3.95e-06,
+ "loss": 0.6195,
+ "step": 79
+ },
+ {
+ "epoch": 0.3761755485893417,
+ "grad_norm": 0.4750897288322449,
+ "learning_rate": 4.000000000000001e-06,
+ "loss": 0.5399,
+ "step": 80
+ },
+ {
+ "epoch": 0.38087774294670845,
+ "grad_norm": 0.5082002878189087,
+ "learning_rate": 4.05e-06,
+ "loss": 0.5997,
+ "step": 81
+ },
+ {
+ "epoch": 0.38557993730407525,
+ "grad_norm": 0.5343796014785767,
+ "learning_rate": 4.1e-06,
+ "loss": 0.5704,
+ "step": 82
+ },
+ {
+ "epoch": 0.390282131661442,
+ "grad_norm": 0.520311713218689,
+ "learning_rate": 4.15e-06,
+ "loss": 0.5818,
+ "step": 83
+ },
+ {
+ "epoch": 0.3949843260188088,
+ "grad_norm": 0.5292978286743164,
+ "learning_rate": 4.2000000000000004e-06,
+ "loss": 0.5852,
+ "step": 84
+ },
+ {
+ "epoch": 0.3996865203761755,
+ "grad_norm": 0.539886474609375,
+ "learning_rate": 4.25e-06,
+ "loss": 0.6057,
+ "step": 85
+ },
+ {
+ "epoch": 0.4043887147335423,
+ "grad_norm": 0.6468827128410339,
+ "learning_rate": 4.3e-06,
+ "loss": 0.6122,
+ "step": 86
+ },
+ {
+ "epoch": 0.4090909090909091,
+ "grad_norm": 0.5537365078926086,
+ "learning_rate": 4.350000000000001e-06,
+ "loss": 0.5652,
+ "step": 87
+ },
+ {
+ "epoch": 0.41379310344827586,
+ "grad_norm": 0.6226018667221069,
+ "learning_rate": 4.4e-06,
+ "loss": 0.5884,
+ "step": 88
+ },
+ {
+ "epoch": 0.41849529780564265,
+ "grad_norm": 0.5016945004463196,
+ "learning_rate": 4.450000000000001e-06,
+ "loss": 0.5877,
+ "step": 89
+ },
+ {
+ "epoch": 0.4231974921630094,
+ "grad_norm": 0.5059167146682739,
+ "learning_rate": 4.5e-06,
+ "loss": 0.5676,
+ "step": 90
+ },
+ {
+ "epoch": 0.4278996865203762,
+ "grad_norm": 0.47521743178367615,
+ "learning_rate": 4.5500000000000005e-06,
+ "loss": 0.5929,
+ "step": 91
+ },
+ {
+ "epoch": 0.43260188087774293,
+ "grad_norm": 0.531306266784668,
+ "learning_rate": 4.600000000000001e-06,
+ "loss": 0.5983,
+ "step": 92
+ },
+ {
+ "epoch": 0.4373040752351097,
+ "grad_norm": 0.4965567886829376,
+ "learning_rate": 4.65e-06,
+ "loss": 0.5279,
+ "step": 93
+ },
+ {
+ "epoch": 0.44200626959247646,
+ "grad_norm": 0.5125988125801086,
+ "learning_rate": 4.7e-06,
+ "loss": 0.5436,
+ "step": 94
+ },
+ {
+ "epoch": 0.44670846394984326,
+ "grad_norm": 0.557763934135437,
+ "learning_rate": 4.75e-06,
+ "loss": 0.5496,
+ "step": 95
+ },
+ {
+ "epoch": 0.45141065830721006,
+ "grad_norm": 0.6993274092674255,
+ "learning_rate": 4.800000000000001e-06,
+ "loss": 0.5498,
+ "step": 96
+ },
+ {
+ "epoch": 0.4561128526645768,
+ "grad_norm": 0.5485453009605408,
+ "learning_rate": 4.85e-06,
+ "loss": 0.5552,
+ "step": 97
+ },
+ {
+ "epoch": 0.4608150470219436,
+ "grad_norm": 1.9821522235870361,
+ "learning_rate": 4.9000000000000005e-06,
+ "loss": 0.569,
+ "step": 98
+ },
+ {
+ "epoch": 0.46551724137931033,
+ "grad_norm": 0.6074144840240479,
+ "learning_rate": 4.95e-06,
+ "loss": 0.5546,
+ "step": 99
+ },
+ {
+ "epoch": 0.4702194357366771,
+ "grad_norm": 0.5404040813446045,
+ "learning_rate": 5e-06,
+ "loss": 0.5775,
+ "step": 100
+ },
+ {
+ "epoch": 0.47492163009404387,
+ "grad_norm": 0.500438928604126,
+ "learning_rate": 4.9999910183883085e-06,
+ "loss": 0.5569,
+ "step": 101
+ },
+ {
+ "epoch": 0.47962382445141066,
+ "grad_norm": 0.5036981701850891,
+ "learning_rate": 4.999964073617768e-06,
+ "loss": 0.5663,
+ "step": 102
+ },
+ {
+ "epoch": 0.4843260188087774,
+ "grad_norm": 0.4537642300128937,
+ "learning_rate": 4.999919165881985e-06,
+ "loss": 0.5527,
+ "step": 103
+ },
+ {
+ "epoch": 0.4890282131661442,
+ "grad_norm": 0.49653521180152893,
+ "learning_rate": 4.999856295503635e-06,
+ "loss": 0.563,
+ "step": 104
+ },
+ {
+ "epoch": 0.493730407523511,
+ "grad_norm": 0.46847566962242126,
+ "learning_rate": 4.9997754629344596e-06,
+ "loss": 0.5425,
+ "step": 105
+ },
+ {
+ "epoch": 0.49843260188087773,
+ "grad_norm": 0.5192411541938782,
+ "learning_rate": 4.999676668755263e-06,
+ "loss": 0.5315,
+ "step": 106
+ },
+ {
+ "epoch": 0.5031347962382445,
+ "grad_norm": 0.5170287489891052,
+ "learning_rate": 4.999559913675912e-06,
+ "loss": 0.5627,
+ "step": 107
+ },
+ {
+ "epoch": 0.5078369905956113,
+ "grad_norm": 0.47297438979148865,
+ "learning_rate": 4.999425198535325e-06,
+ "loss": 0.5432,
+ "step": 108
+ },
+ {
+ "epoch": 0.512539184952978,
+ "grad_norm": 0.4873776137828827,
+ "learning_rate": 4.999272524301469e-06,
+ "loss": 0.5473,
+ "step": 109
+ },
+ {
+ "epoch": 0.5172413793103449,
+ "grad_norm": 0.5432935357093811,
+ "learning_rate": 4.9991018920713505e-06,
+ "loss": 0.5642,
+ "step": 110
+ },
+ {
+ "epoch": 0.5219435736677116,
+ "grad_norm": 0.4850105345249176,
+ "learning_rate": 4.9989133030710154e-06,
+ "loss": 0.548,
+ "step": 111
+ },
+ {
+ "epoch": 0.5266457680250783,
+ "grad_norm": 0.9399585723876953,
+ "learning_rate": 4.9987067586555275e-06,
+ "loss": 0.5471,
+ "step": 112
+ },
+ {
+ "epoch": 0.5313479623824452,
+ "grad_norm": 0.5167811512947083,
+ "learning_rate": 4.998482260308969e-06,
+ "loss": 0.5648,
+ "step": 113
+ },
+ {
+ "epoch": 0.5360501567398119,
+ "grad_norm": 0.5069029927253723,
+ "learning_rate": 4.998239809644427e-06,
+ "loss": 0.5568,
+ "step": 114
+ },
+ {
+ "epoch": 0.5407523510971787,
+ "grad_norm": 0.8738563656806946,
+ "learning_rate": 4.9979794084039755e-06,
+ "loss": 0.5719,
+ "step": 115
+ },
+ {
+ "epoch": 0.5454545454545454,
+ "grad_norm": 0.5216553807258606,
+ "learning_rate": 4.997701058458677e-06,
+ "loss": 0.5309,
+ "step": 116
+ },
+ {
+ "epoch": 0.5501567398119123,
+ "grad_norm": 0.9678344130516052,
+ "learning_rate": 4.997404761808554e-06,
+ "loss": 0.5645,
+ "step": 117
+ },
+ {
+ "epoch": 0.554858934169279,
+ "grad_norm": 0.496598482131958,
+ "learning_rate": 4.9970905205825845e-06,
+ "loss": 0.5711,
+ "step": 118
+ },
+ {
+ "epoch": 0.5595611285266457,
+ "grad_norm": 0.4745199680328369,
+ "learning_rate": 4.996758337038683e-06,
+ "loss": 0.5613,
+ "step": 119
+ },
+ {
+ "epoch": 0.5642633228840125,
+ "grad_norm": 0.5595977902412415,
+ "learning_rate": 4.996408213563684e-06,
+ "loss": 0.5559,
+ "step": 120
+ },
+ {
+ "epoch": 0.5689655172413793,
+ "grad_norm": 0.4743712544441223,
+ "learning_rate": 4.996040152673326e-06,
+ "loss": 0.5228,
+ "step": 121
+ },
+ {
+ "epoch": 0.5736677115987461,
+ "grad_norm": 0.5418100953102112,
+ "learning_rate": 4.995654157012233e-06,
+ "loss": 0.536,
+ "step": 122
+ },
+ {
+ "epoch": 0.5783699059561128,
+ "grad_norm": 0.521977424621582,
+ "learning_rate": 4.995250229353895e-06,
+ "loss": 0.5305,
+ "step": 123
+ },
+ {
+ "epoch": 0.5830721003134797,
+ "grad_norm": 0.5062761902809143,
+ "learning_rate": 4.99482837260065e-06,
+ "loss": 0.5417,
+ "step": 124
+ },
+ {
+ "epoch": 0.5877742946708464,
+ "grad_norm": 0.5895913243293762,
+ "learning_rate": 4.99438858978366e-06,
+ "loss": 0.573,
+ "step": 125
+ },
+ {
+ "epoch": 0.5924764890282131,
+ "grad_norm": 0.5442466139793396,
+ "learning_rate": 4.993930884062892e-06,
+ "loss": 0.5563,
+ "step": 126
+ },
+ {
+ "epoch": 0.5971786833855799,
+ "grad_norm": 0.5130571722984314,
+ "learning_rate": 4.993455258727094e-06,
+ "loss": 0.5549,
+ "step": 127
+ },
+ {
+ "epoch": 0.6018808777429467,
+ "grad_norm": 0.5579081773757935,
+ "learning_rate": 4.992961717193773e-06,
+ "loss": 0.5554,
+ "step": 128
+ },
+ {
+ "epoch": 0.6065830721003135,
+ "grad_norm": 0.6375890374183655,
+ "learning_rate": 4.9924502630091655e-06,
+ "loss": 0.5626,
+ "step": 129
+ },
+ {
+ "epoch": 0.6112852664576802,
+ "grad_norm": 0.5129190683364868,
+ "learning_rate": 4.99192089984822e-06,
+ "loss": 0.5493,
+ "step": 130
+ },
+ {
+ "epoch": 0.6159874608150471,
+ "grad_norm": 0.5293419361114502,
+ "learning_rate": 4.9913736315145614e-06,
+ "loss": 0.5565,
+ "step": 131
+ },
+ {
+ "epoch": 0.6206896551724138,
+ "grad_norm": 0.6502572298049927,
+ "learning_rate": 4.990808461940474e-06,
+ "loss": 0.5358,
+ "step": 132
+ },
+ {
+ "epoch": 0.6253918495297806,
+ "grad_norm": 0.5450296998023987,
+ "learning_rate": 4.990225395186862e-06,
+ "loss": 0.5421,
+ "step": 133
+ },
+ {
+ "epoch": 0.6300940438871473,
+ "grad_norm": 0.45506399869918823,
+ "learning_rate": 4.9896244354432314e-06,
+ "loss": 0.5396,
+ "step": 134
+ },
+ {
+ "epoch": 0.6347962382445141,
+ "grad_norm": 0.5095545649528503,
+ "learning_rate": 4.98900558702765e-06,
+ "loss": 0.5486,
+ "step": 135
+ },
+ {
+ "epoch": 0.6394984326018809,
+ "grad_norm": 0.4836446940898895,
+ "learning_rate": 4.9883688543867225e-06,
+ "loss": 0.5596,
+ "step": 136
+ },
+ {
+ "epoch": 0.6442006269592476,
+ "grad_norm": 0.5253512859344482,
+ "learning_rate": 4.987714242095558e-06,
+ "loss": 0.5308,
+ "step": 137
+ },
+ {
+ "epoch": 0.6489028213166145,
+ "grad_norm": 0.8280164003372192,
+ "learning_rate": 4.9870417548577355e-06,
+ "loss": 0.5349,
+ "step": 138
+ },
+ {
+ "epoch": 0.6536050156739812,
+ "grad_norm": 0.4729730188846588,
+ "learning_rate": 4.9863513975052696e-06,
+ "loss": 0.5416,
+ "step": 139
+ },
+ {
+ "epoch": 0.658307210031348,
+ "grad_norm": 0.5932718515396118,
+ "learning_rate": 4.985643174998578e-06,
+ "loss": 0.5638,
+ "step": 140
+ },
+ {
+ "epoch": 0.6630094043887147,
+ "grad_norm": 0.5187026262283325,
+ "learning_rate": 4.984917092426445e-06,
+ "loss": 0.5507,
+ "step": 141
+ },
+ {
+ "epoch": 0.6677115987460815,
+ "grad_norm": 0.5024245977401733,
+ "learning_rate": 4.984173155005982e-06,
+ "loss": 0.5406,
+ "step": 142
+ },
+ {
+ "epoch": 0.6724137931034483,
+ "grad_norm": 0.4735509157180786,
+ "learning_rate": 4.983411368082597e-06,
+ "loss": 0.5431,
+ "step": 143
+ },
+ {
+ "epoch": 0.677115987460815,
+ "grad_norm": 0.5040024518966675,
+ "learning_rate": 4.982631737129948e-06,
+ "loss": 0.5291,
+ "step": 144
+ },
+ {
+ "epoch": 0.6818181818181818,
+ "grad_norm": 0.47764894366264343,
+ "learning_rate": 4.98183426774991e-06,
+ "loss": 0.5677,
+ "step": 145
+ },
+ {
+ "epoch": 0.6865203761755486,
+ "grad_norm": 0.5211489796638489,
+ "learning_rate": 4.981018965672529e-06,
+ "loss": 0.566,
+ "step": 146
+ },
+ {
+ "epoch": 0.6912225705329154,
+ "grad_norm": 1.022007942199707,
+ "learning_rate": 4.98018583675599e-06,
+ "loss": 0.5476,
+ "step": 147
+ },
+ {
+ "epoch": 0.6959247648902821,
+ "grad_norm": 0.5263912677764893,
+ "learning_rate": 4.979334886986562e-06,
+ "loss": 0.5473,
+ "step": 148
+ },
+ {
+ "epoch": 0.700626959247649,
+ "grad_norm": 0.5014091730117798,
+ "learning_rate": 4.978466122478567e-06,
+ "loss": 0.5642,
+ "step": 149
+ },
+ {
+ "epoch": 0.7053291536050157,
+ "grad_norm": 0.5003350973129272,
+ "learning_rate": 4.97757954947433e-06,
+ "loss": 0.5311,
+ "step": 150
+ },
+ {
+ "epoch": 0.7100313479623824,
+ "grad_norm": 0.5010690093040466,
+ "learning_rate": 4.976675174344132e-06,
+ "loss": 0.5469,
+ "step": 151
+ },
+ {
+ "epoch": 0.7147335423197492,
+ "grad_norm": 0.45779237151145935,
+ "learning_rate": 4.975753003586172e-06,
+ "loss": 0.5273,
+ "step": 152
+ },
+ {
+ "epoch": 0.719435736677116,
+ "grad_norm": 0.6231539845466614,
+ "learning_rate": 4.974813043826513e-06,
+ "loss": 0.5182,
+ "step": 153
+ },
+ {
+ "epoch": 0.7241379310344828,
+ "grad_norm": 0.5361394286155701,
+ "learning_rate": 4.973855301819039e-06,
+ "loss": 0.5372,
+ "step": 154
+ },
+ {
+ "epoch": 0.7288401253918495,
+ "grad_norm": 0.5193538665771484,
+ "learning_rate": 4.972879784445402e-06,
+ "loss": 0.5201,
+ "step": 155
+ },
+ {
+ "epoch": 0.7335423197492164,
+ "grad_norm": 0.47956809401512146,
+ "learning_rate": 4.971886498714978e-06,
+ "loss": 0.5402,
+ "step": 156
+ },
+ {
+ "epoch": 0.7382445141065831,
+ "grad_norm": 0.5303016901016235,
+ "learning_rate": 4.97087545176481e-06,
+ "loss": 0.5174,
+ "step": 157
+ },
+ {
+ "epoch": 0.7429467084639498,
+ "grad_norm": 0.5002286434173584,
+ "learning_rate": 4.9698466508595655e-06,
+ "loss": 0.5453,
+ "step": 158
+ },
+ {
+ "epoch": 0.7476489028213166,
+ "grad_norm": 0.6070297360420227,
+ "learning_rate": 4.9688001033914756e-06,
+ "loss": 0.5327,
+ "step": 159
+ },
+ {
+ "epoch": 0.7523510971786834,
+ "grad_norm": 0.5436793565750122,
+ "learning_rate": 4.967735816880286e-06,
+ "loss": 0.544,
+ "step": 160
+ },
+ {
+ "epoch": 0.7570532915360502,
+ "grad_norm": 0.538012683391571,
+ "learning_rate": 4.966653798973205e-06,
+ "loss": 0.5233,
+ "step": 161
+ },
+ {
+ "epoch": 0.7617554858934169,
+ "grad_norm": 0.4916169345378876,
+ "learning_rate": 4.965554057444842e-06,
+ "loss": 0.5168,
+ "step": 162
+ },
+ {
+ "epoch": 0.7664576802507836,
+ "grad_norm": 0.48281437158584595,
+ "learning_rate": 4.964436600197161e-06,
+ "loss": 0.5393,
+ "step": 163
+ },
+ {
+ "epoch": 0.7711598746081505,
+ "grad_norm": 0.5184990167617798,
+ "learning_rate": 4.963301435259413e-06,
+ "loss": 0.5085,
+ "step": 164
+ },
+ {
+ "epoch": 0.7758620689655172,
+ "grad_norm": 0.4706438183784485,
+ "learning_rate": 4.962148570788088e-06,
+ "loss": 0.5299,
+ "step": 165
+ },
+ {
+ "epoch": 0.780564263322884,
+ "grad_norm": 0.6550764441490173,
+ "learning_rate": 4.96097801506685e-06,
+ "loss": 0.5192,
+ "step": 166
+ },
+ {
+ "epoch": 0.7852664576802508,
+ "grad_norm": 0.5386581420898438,
+ "learning_rate": 4.959789776506482e-06,
+ "loss": 0.5258,
+ "step": 167
+ },
+ {
+ "epoch": 0.7899686520376176,
+ "grad_norm": 0.5060779452323914,
+ "learning_rate": 4.958583863644821e-06,
+ "loss": 0.5512,
+ "step": 168
+ },
+ {
+ "epoch": 0.7946708463949843,
+ "grad_norm": 0.47050032019615173,
+ "learning_rate": 4.9573602851466985e-06,
+ "loss": 0.5176,
+ "step": 169
+ },
+ {
+ "epoch": 0.799373040752351,
+ "grad_norm": 7.3139567375183105,
+ "learning_rate": 4.9561190498038815e-06,
+ "loss": 0.5381,
+ "step": 170
+ },
+ {
+ "epoch": 0.8040752351097179,
+ "grad_norm": 0.620528519153595,
+ "learning_rate": 4.954860166535005e-06,
+ "loss": 0.5299,
+ "step": 171
+ },
+ {
+ "epoch": 0.8087774294670846,
+ "grad_norm": 0.45067766308784485,
+ "learning_rate": 4.95358364438551e-06,
+ "loss": 0.5328,
+ "step": 172
+ },
+ {
+ "epoch": 0.8134796238244514,
+ "grad_norm": 0.6771508455276489,
+ "learning_rate": 4.952289492527576e-06,
+ "loss": 0.5601,
+ "step": 173
+ },
+ {
+ "epoch": 0.8181818181818182,
+ "grad_norm": 0.518925130367279,
+ "learning_rate": 4.9509777202600605e-06,
+ "loss": 0.494,
+ "step": 174
+ },
+ {
+ "epoch": 0.822884012539185,
+ "grad_norm": 0.5191988945007324,
+ "learning_rate": 4.949648337008425e-06,
+ "loss": 0.5425,
+ "step": 175
+ },
+ {
+ "epoch": 0.8275862068965517,
+ "grad_norm": 0.8600963354110718,
+ "learning_rate": 4.948301352324674e-06,
+ "loss": 0.5332,
+ "step": 176
+ },
+ {
+ "epoch": 0.8322884012539185,
+ "grad_norm": 0.5405915379524231,
+ "learning_rate": 4.946936775887281e-06,
+ "loss": 0.5276,
+ "step": 177
+ },
+ {
+ "epoch": 0.8369905956112853,
+ "grad_norm": 0.48730772733688354,
+ "learning_rate": 4.945554617501124e-06,
+ "loss": 0.5217,
+ "step": 178
+ },
+ {
+ "epoch": 0.841692789968652,
+ "grad_norm": 0.5092865824699402,
+ "learning_rate": 4.944154887097411e-06,
+ "loss": 0.5534,
+ "step": 179
+ },
+ {
+ "epoch": 0.8463949843260188,
+ "grad_norm": 0.4994933605194092,
+ "learning_rate": 4.942737594733608e-06,
+ "loss": 0.5242,
+ "step": 180
+ },
+ {
+ "epoch": 0.8510971786833855,
+ "grad_norm": 0.4554043412208557,
+ "learning_rate": 4.941302750593373e-06,
+ "loss": 0.5424,
+ "step": 181
+ },
+ {
+ "epoch": 0.8557993730407524,
+ "grad_norm": 0.4865265488624573,
+ "learning_rate": 4.939850364986475e-06,
+ "loss": 0.482,
+ "step": 182
+ },
+ {
+ "epoch": 0.8605015673981191,
+ "grad_norm": 0.5013875365257263,
+ "learning_rate": 4.938380448348725e-06,
+ "loss": 0.4908,
+ "step": 183
+ },
+ {
+ "epoch": 0.8652037617554859,
+ "grad_norm": 0.4997917115688324,
+ "learning_rate": 4.9368930112419e-06,
+ "loss": 0.5336,
+ "step": 184
+ },
+ {
+ "epoch": 0.8699059561128527,
+ "grad_norm": 0.4783482551574707,
+ "learning_rate": 4.935388064353665e-06,
+ "loss": 0.5338,
+ "step": 185
+ },
+ {
+ "epoch": 0.8746081504702194,
+ "grad_norm": 0.7221089005470276,
+ "learning_rate": 4.9338656184975e-06,
+ "loss": 0.5327,
+ "step": 186
+ },
+ {
+ "epoch": 0.8793103448275862,
+ "grad_norm": 0.48115843534469604,
+ "learning_rate": 4.932325684612618e-06,
+ "loss": 0.5408,
+ "step": 187
+ },
+ {
+ "epoch": 0.8840125391849529,
+ "grad_norm": 0.4940219223499298,
+ "learning_rate": 4.93076827376389e-06,
+ "loss": 0.5455,
+ "step": 188
+ },
+ {
+ "epoch": 0.8887147335423198,
+ "grad_norm": 0.4754747450351715,
+ "learning_rate": 4.9291933971417635e-06,
+ "loss": 0.542,
+ "step": 189
+ },
+ {
+ "epoch": 0.8934169278996865,
+ "grad_norm": 0.548713207244873,
+ "learning_rate": 4.9276010660621835e-06,
+ "loss": 0.5292,
+ "step": 190
+ },
+ {
+ "epoch": 0.8981191222570533,
+ "grad_norm": 0.7292612195014954,
+ "learning_rate": 4.925991291966508e-06,
+ "loss": 0.5073,
+ "step": 191
+ },
+ {
+ "epoch": 0.9028213166144201,
+ "grad_norm": 0.5254770517349243,
+ "learning_rate": 4.92436408642143e-06,
+ "loss": 0.5451,
+ "step": 192
+ },
+ {
+ "epoch": 0.9075235109717869,
+ "grad_norm": 0.47938767075538635,
+ "learning_rate": 4.9227194611188934e-06,
+ "loss": 0.5204,
+ "step": 193
+ },
+ {
+ "epoch": 0.9122257053291536,
+ "grad_norm": 0.6740232706069946,
+ "learning_rate": 4.921057427876007e-06,
+ "loss": 0.4928,
+ "step": 194
+ },
+ {
+ "epoch": 0.9169278996865203,
+ "grad_norm": 0.5455343723297119,
+ "learning_rate": 4.919377998634959e-06,
+ "loss": 0.5468,
+ "step": 195
+ },
+ {
+ "epoch": 0.9216300940438872,
+ "grad_norm": 0.5001958012580872,
+ "learning_rate": 4.917681185462934e-06,
+ "loss": 0.5339,
+ "step": 196
+ },
+ {
+ "epoch": 0.9263322884012539,
+ "grad_norm": 0.5084257125854492,
+ "learning_rate": 4.915967000552028e-06,
+ "loss": 0.5259,
+ "step": 197
+ },
+ {
+ "epoch": 0.9310344827586207,
+ "grad_norm": 0.4807164967060089,
+ "learning_rate": 4.914235456219154e-06,
+ "loss": 0.5204,
+ "step": 198
+ },
+ {
+ "epoch": 0.9357366771159875,
+ "grad_norm": 0.6099370718002319,
+ "learning_rate": 4.912486564905959e-06,
+ "loss": 0.544,
+ "step": 199
+ },
+ {
+ "epoch": 0.9404388714733543,
+ "grad_norm": 0.47461947798728943,
+ "learning_rate": 4.910720339178735e-06,
+ "loss": 0.5295,
+ "step": 200
+ },
+ {
+ "epoch": 0.945141065830721,
+ "grad_norm": 0.500136137008667,
+ "learning_rate": 4.908936791728323e-06,
+ "loss": 0.5321,
+ "step": 201
+ },
+ {
+ "epoch": 0.9498432601880877,
+ "grad_norm": 0.5235631465911865,
+ "learning_rate": 4.907135935370027e-06,
+ "loss": 0.5338,
+ "step": 202
+ },
+ {
+ "epoch": 0.9545454545454546,
+ "grad_norm": 0.9285804629325867,
+ "learning_rate": 4.905317783043523e-06,
+ "loss": 0.5393,
+ "step": 203
+ },
+ {
+ "epoch": 0.9592476489028213,
+ "grad_norm": 0.4834178388118744,
+ "learning_rate": 4.9034823478127605e-06,
+ "loss": 0.5211,
+ "step": 204
+ },
+ {
+ "epoch": 0.9639498432601881,
+ "grad_norm": 0.4830580949783325,
+ "learning_rate": 4.901629642865872e-06,
+ "loss": 0.4986,
+ "step": 205
+ },
+ {
+ "epoch": 0.9686520376175548,
+ "grad_norm": 0.49718615412712097,
+ "learning_rate": 4.89975968151508e-06,
+ "loss": 0.5204,
+ "step": 206
+ },
+ {
+ "epoch": 0.9733542319749217,
+ "grad_norm": 0.5056726336479187,
+ "learning_rate": 4.8978724771965965e-06,
+ "loss": 0.5133,
+ "step": 207
+ },
+ {
+ "epoch": 0.9780564263322884,
+ "grad_norm": 0.7357563376426697,
+ "learning_rate": 4.895968043470532e-06,
+ "loss": 0.5307,
+ "step": 208
+ },
+ {
+ "epoch": 0.9827586206896551,
+ "grad_norm": 0.515610933303833,
+ "learning_rate": 4.894046394020794e-06,
+ "loss": 0.4955,
+ "step": 209
+ },
+ {
+ "epoch": 0.987460815047022,
+ "grad_norm": 0.5124618411064148,
+ "learning_rate": 4.892107542654988e-06,
+ "loss": 0.526,
+ "step": 210
+ },
+ {
+ "epoch": 0.9921630094043887,
+ "grad_norm": 0.5059565901756287,
+ "learning_rate": 4.890151503304325e-06,
+ "loss": 0.5473,
+ "step": 211
+ },
+ {
+ "epoch": 0.9968652037617555,
+ "grad_norm": 0.4806717336177826,
+ "learning_rate": 4.88817829002351e-06,
+ "loss": 0.5212,
+ "step": 212
+ }
+ ],
+ "logging_steps": 1,
+ "max_steps": 1272,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 6,
+ "save_steps": 212,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 1.3638348518765625e+19,
+ "train_batch_size": 4,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/checkpoint-212/training_args.bin b/checkpoint-212/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7db90ca60ea3c300feb3b7d6e0cb54fc7cfb2060
--- /dev/null
+++ b/checkpoint-212/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:51f85402b182fc4b86518e0cb9ca9cbf150300e36000a38f53507b9a8663ad4b
+size 7928
diff --git a/checkpoint-212/zero_to_fp32.py b/checkpoint-212/zero_to_fp32.py
new file mode 100644
index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8
--- /dev/null
+++ b/checkpoint-212/zero_to_fp32.py
@@ -0,0 +1,604 @@
+#!/usr/bin/env python
+
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example: python zero_to_fp32.py . pytorch_model.bin
+
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+from collections import OrderedDict
+from dataclasses import dataclass
+
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+ FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+ FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+
+
+@dataclass
+class zero_model_state:
+ buffers: dict()
+ param_shapes: dict()
+ shared_params: list
+ ds_version: int
+ frozen_param_shapes: dict()
+ frozen_param_fragments: dict()
+
+
+debug = 0
+
+# load to cpu
+device = torch.device('cpu')
+
+
+def atoi(text):
+ return int(text) if text.isdigit() else text
+
+
+def natural_keys(text):
+ '''
+ alist.sort(key=natural_keys) sorts in human order
+ http://nedbatchelder.com/blog/200712/human_sorting.html
+ (See Toothy's implementation in the comments)
+ '''
+ return [atoi(c) for c in re.split(r'(\d+)', text)]
+
+
+def get_model_state_file(checkpoint_dir, zero_stage):
+ if not os.path.isdir(checkpoint_dir):
+ raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+
+ # there should be only one file
+ if zero_stage <= 2:
+ file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+ elif zero_stage == 3:
+ file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+
+ if not os.path.exists(file):
+ raise FileNotFoundError(f"can't find model states file at '{file}'")
+
+ return file
+
+
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+ # XXX: need to test that this simple glob rule works for multi-node setup too
+ ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
+
+ if len(ckpt_files) == 0:
+ raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+
+ return ckpt_files
+
+
+def get_optim_files(checkpoint_dir):
+ return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
+
+
+def get_model_state_files(checkpoint_dir):
+ return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
+
+
+def parse_model_states(files):
+ zero_model_states = []
+ for file in files:
+ state_dict = torch.load(file, map_location=device)
+
+ if BUFFER_NAMES not in state_dict:
+ raise ValueError(f"{file} is not a model state checkpoint")
+ buffer_names = state_dict[BUFFER_NAMES]
+ if debug:
+ print("Found buffers:", buffer_names)
+
+ # recover just the buffers while restoring them to fp32 if they were saved in fp16
+ buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+ param_shapes = state_dict[PARAM_SHAPES]
+
+ # collect parameters that are included in param_shapes
+ param_names = []
+ for s in param_shapes:
+ for name in s.keys():
+ param_names.append(name)
+
+ # update with frozen parameters
+ frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+ if frozen_param_shapes is not None:
+ if debug:
+ print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+ param_names += list(frozen_param_shapes.keys())
+
+ # handle shared params
+ shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+
+ ds_version = state_dict.get(DS_VERSION, None)
+
+ frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+
+ z_model_state = zero_model_state(buffers=buffers,
+ param_shapes=param_shapes,
+ shared_params=shared_params,
+ ds_version=ds_version,
+ frozen_param_shapes=frozen_param_shapes,
+ frozen_param_fragments=frozen_param_fragments)
+ zero_model_states.append(z_model_state)
+
+ return zero_model_states
+
+
+def parse_optim_states(files, ds_checkpoint_dir):
+
+ total_files = len(files)
+ state_dicts = []
+ for f in files:
+ state_dict = torch.load(f, map_location=device)
+ # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
+ # and also handle the case where it was already removed by another helper script
+ state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
+ state_dicts.append(state_dict)
+
+ if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]:
+ raise ValueError(f"{files[0]} is not a zero checkpoint")
+ zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
+ world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
+
+ # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+ # parameters can be different from data parallelism for non-expert parameters. So we can just
+ # use the max of the partition_count to get the dp world_size.
+
+ if type(world_size) is list:
+ world_size = max(world_size)
+
+ if world_size != total_files:
+ raise ValueError(
+ f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+ "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+ )
+
+ # the groups are named differently in each stage
+ if zero_stage <= 2:
+ fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+ elif zero_stage == 3:
+ fp32_groups_key = FP32_FLAT_GROUPS
+ else:
+ raise ValueError(f"unknown zero stage {zero_stage}")
+
+ if zero_stage <= 2:
+ fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
+ elif zero_stage == 3:
+ # if there is more than one param group, there will be multiple flattened tensors - one
+ # flattened tensor per group - for simplicity merge them into a single tensor
+ #
+ # XXX: could make the script more memory efficient for when there are multiple groups - it
+ # will require matching the sub-lists of param_shapes for each param group flattened tensor
+
+ fp32_flat_groups = [
+ torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts))
+ ]
+
+ return zero_stage, world_size, fp32_flat_groups
+
+
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+ """
+ Returns fp32 state_dict reconstructed from ds checkpoint
+
+ Args:
+ - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+
+ """
+ print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+
+ optim_files = get_optim_files(ds_checkpoint_dir)
+ zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
+ print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+
+ model_files = get_model_state_files(ds_checkpoint_dir)
+
+ zero_model_states = parse_model_states(model_files)
+ print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+
+ if zero_stage <= 2:
+ return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters)
+ elif zero_stage == 3:
+ return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters)
+
+
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+ if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+ return
+
+ frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+ frozen_param_fragments = zero_model_states[0].frozen_param_fragments
+
+ if debug:
+ num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+ print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+ wanted_params = len(frozen_param_shapes)
+ wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+ avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+ print(f'Frozen params: Have {avail_numel} numels to process.')
+ print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+ total_params = 0
+ total_numel = 0
+ for name, shape in frozen_param_shapes.items():
+ total_params += 1
+ unpartitioned_numel = shape.numel()
+ total_numel += unpartitioned_numel
+
+ state_dict[name] = frozen_param_fragments[name]
+
+ if debug:
+ print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+
+ print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _has_callable(obj, fn):
+ attr = getattr(obj, fn, None)
+ return callable(attr)
+
+
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+ param_shapes = zero_model_states[0].param_shapes
+
+ # Reconstruction protocol:
+ #
+ # XXX: document this
+
+ if debug:
+ for i in range(world_size):
+ for j in range(len(fp32_flat_groups[0])):
+ print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+
+ # XXX: memory usage doubles here (zero2)
+ num_param_groups = len(fp32_flat_groups[0])
+ merged_single_partition_of_fp32_groups = []
+ for i in range(num_param_groups):
+ merged_partitions = [sd[i] for sd in fp32_flat_groups]
+ full_single_fp32_vector = torch.cat(merged_partitions, 0)
+ merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+ avail_numel = sum(
+ [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+
+ if debug:
+ wanted_params = sum([len(shapes) for shapes in param_shapes])
+ wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+ # not asserting if there is a mismatch due to possible padding
+ print(f"Have {avail_numel} numels to process.")
+ print(f"Need {wanted_numel} numels in {wanted_params} params.")
+
+ # params
+ # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+ # out-of-core computing solution
+ total_numel = 0
+ total_params = 0
+ for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+ offset = 0
+ avail_numel = full_single_fp32_vector.numel()
+ for name, shape in shapes.items():
+
+ unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+ total_numel += unpartitioned_numel
+ total_params += 1
+
+ if debug:
+ print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+ state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+ offset += unpartitioned_numel
+
+ # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+ # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+ # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+ # live optimizer object, so we are checking that the numbers are within the right range
+ align_to = 2 * world_size
+
+ def zero2_align(x):
+ return align_to * math.ceil(x / align_to)
+
+ if debug:
+ print(f"original offset={offset}, avail_numel={avail_numel}")
+
+ offset = zero2_align(offset)
+ avail_numel = zero2_align(avail_numel)
+
+ if debug:
+ print(f"aligned offset={offset}, avail_numel={avail_numel}")
+
+ # Sanity check
+ if offset != avail_numel:
+ raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+ print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters):
+ state_dict = OrderedDict()
+
+ # buffers
+ buffers = zero_model_states[0].buffers
+ state_dict.update(buffers)
+ if debug:
+ print(f"added {len(buffers)} buffers")
+
+ if not exclude_frozen_parameters:
+ _zero2_merge_frozen_params(state_dict, zero_model_states)
+
+ _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+ # recover shared parameters
+ for pair in zero_model_states[0].shared_params:
+ if pair[1] in state_dict:
+ state_dict[pair[0]] = state_dict[pair[1]]
+
+ return state_dict
+
+
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+ remainder = unpartitioned_numel % world_size
+ padding_numel = (world_size - remainder) if remainder else 0
+ partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+ return partitioned_numel, padding_numel
+
+
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+ if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+ return
+
+ if debug:
+ for i in range(world_size):
+ num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+ print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+ frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+ wanted_params = len(frozen_param_shapes)
+ wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+ avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+ print(f'Frozen params: Have {avail_numel} numels to process.')
+ print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+ total_params = 0
+ total_numel = 0
+ for name, shape in zero_model_states[0].frozen_param_shapes.items():
+ total_params += 1
+ unpartitioned_numel = shape.numel()
+ total_numel += unpartitioned_numel
+
+ param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+ state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+
+ partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+ if debug:
+ print(
+ f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+ )
+
+ print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+ param_shapes = zero_model_states[0].param_shapes
+ avail_numel = fp32_flat_groups[0].numel() * world_size
+ # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+ # param, re-consolidating each param, while dealing with padding if any
+
+ # merge list of dicts, preserving order
+ param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+
+ if debug:
+ for i in range(world_size):
+ print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+
+ wanted_params = len(param_shapes)
+ wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+ # not asserting if there is a mismatch due to possible padding
+ avail_numel = fp32_flat_groups[0].numel() * world_size
+ print(f"Trainable params: Have {avail_numel} numels to process.")
+ print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+
+ # params
+ # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+ # out-of-core computing solution
+ offset = 0
+ total_numel = 0
+ total_params = 0
+ for name, shape in param_shapes.items():
+
+ unpartitioned_numel = shape.numel()
+ total_numel += unpartitioned_numel
+ total_params += 1
+
+ partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+ if debug:
+ print(
+ f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+ )
+
+ # XXX: memory usage doubles here
+ state_dict[name] = torch.cat(
+ tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)),
+ 0).narrow(0, 0, unpartitioned_numel).view(shape)
+ offset += partitioned_numel
+
+ offset *= world_size
+
+ # Sanity check
+ if offset != avail_numel:
+ raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+ print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters):
+ state_dict = OrderedDict()
+
+ # buffers
+ buffers = zero_model_states[0].buffers
+ state_dict.update(buffers)
+ if debug:
+ print(f"added {len(buffers)} buffers")
+
+ if not exclude_frozen_parameters:
+ _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+
+ _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+ # recover shared parameters
+ for pair in zero_model_states[0].shared_params:
+ if pair[1] in state_dict:
+ state_dict[pair[0]] = state_dict[pair[1]]
+
+ return state_dict
+
+
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False):
+ """
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+ ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+ via a model hub.
+
+ Args:
+ - ``checkpoint_dir``: path to the desired checkpoint folder
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+ - ``exclude_frozen_parameters``: exclude frozen parameters
+
+ Returns:
+ - pytorch ``state_dict``
+
+ Note: this approach may not work if your application doesn't have sufficient free CPU memory and
+ you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+ the checkpoint.
+
+ A typical usage might be ::
+
+ from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+ # do the training and checkpoint saving
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+ model = model.cpu() # move to cpu
+ model.load_state_dict(state_dict)
+ # submit to model hub or save the model to share with others
+
+ In this example the ``model`` will no longer be usable in the deepspeed context of the same
+ application. i.e. you will need to re-initialize the deepspeed engine, since
+ ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+ If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+
+ """
+ if tag is None:
+ latest_path = os.path.join(checkpoint_dir, 'latest')
+ if os.path.isfile(latest_path):
+ with open(latest_path, 'r') as fd:
+ tag = fd.read().strip()
+ else:
+ raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+
+ ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+
+ if not os.path.isdir(ds_checkpoint_dir):
+ raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+
+ return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+
+
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False):
+ """
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
+ loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+
+ Args:
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+ - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+ - ``exclude_frozen_parameters``: exclude frozen parameters
+ """
+
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters)
+ print(f"Saving fp32 state dict to {output_file}")
+ torch.save(state_dict, output_file)
+
+
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+ """
+ 1. Put the provided model to cpu
+ 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+ 3. Load it into the provided model
+
+ Args:
+ - ``model``: the model object to update
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+
+ Returns:
+ - ``model`: modified model
+
+ Make sure you have plenty of CPU memory available before you call this function. If you don't
+ have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+ conveniently placed for you in the checkpoint folder.
+
+ A typical usage might be ::
+
+ from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+ model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+ # submit to model hub or save the model to share with others
+
+ Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+ of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+ ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+ """
+ logger.info(f"Extracting fp32 weights")
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+
+ logger.info(f"Overwriting model with fp32 weights")
+ model = model.cpu()
+ model.load_state_dict(state_dict, strict=False)
+
+ return model
+
+
+if __name__ == "__main__":
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument("checkpoint_dir",
+ type=str,
+ help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+ parser.add_argument(
+ "output_file",
+ type=str,
+ help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)")
+ parser.add_argument("-t",
+ "--tag",
+ type=str,
+ default=None,
+ help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+ parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+ parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+ args = parser.parse_args()
+
+ debug = args.debug
+
+ convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+ args.output_file,
+ tag=args.tag,
+ exclude_frozen_parameters=args.exclude_frozen_parameters)
diff --git a/checkpoint-424/README.md b/checkpoint-424/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1b184114a0c28ed3e4c082c18486736dc818166d
--- /dev/null
+++ b/checkpoint-424/README.md
@@ -0,0 +1,202 @@
+---
+base_model: meta-llama/Llama-3.3-70B-Instruct
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.15.0
\ No newline at end of file
diff --git a/checkpoint-424/adapter_config.json b/checkpoint-424/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..dc930e1be2d901773c96d6e6d186c72676cbf328
--- /dev/null
+++ b/checkpoint-424/adapter_config.json
@@ -0,0 +1,42 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "meta-llama/Llama-3.3-70B-Instruct",
+ "bias": "none",
+ "corda_config": null,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": null,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 512,
+ "lora_bias": false,
+ "lora_dropout": 0.05,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": [
+ "embed_tokens",
+ "lm_head"
+ ],
+ "peft_type": "LORA",
+ "r": 256,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "up_proj",
+ "gate_proj",
+ "o_proj",
+ "v_proj",
+ "q_proj",
+ "k_proj",
+ "down_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-424/adapter_model.safetensors b/checkpoint-424/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..70c1f296eae9aa12542420b7d9698065dec9abec
--- /dev/null
+++ b/checkpoint-424/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:231d43a9c50a128cd887cd13b915dd412b9501ca4233ded1776f57b5e7c66bb3
+size 10829849744
diff --git a/checkpoint-424/global_step424/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-424/global_step424/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..a2a5feeafcf6a5c11d61e0d4e462bf23db68b09d
--- /dev/null
+++ b/checkpoint-424/global_step424/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8edd52e273bc55ea16fe995787c0a25eca6598b3922068bbe3f7de4e44237f82
+size 21659418140
diff --git a/checkpoint-424/global_step424/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-424/global_step424/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..19f9aa32fc7ce92b3c93ebf656a0ce0bdd2c3d17
--- /dev/null
+++ b/checkpoint-424/global_step424/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5c2335c0c38a7fe8cbe03f287d106c466dc7ea9ad932686df1ab8c4818908d7e
+size 21659457372
diff --git a/checkpoint-424/global_step424/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-424/global_step424/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..95bccca761678b8ee9ea3f05f4425e7294e52d2f
--- /dev/null
+++ b/checkpoint-424/global_step424/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4653a6a5fe2a9aa311b284f75941b1b82e12aaf14294a26071e13d07db9a0912
+size 21659417820
diff --git a/checkpoint-424/global_step424/mp_rank_00_model_states.pt b/checkpoint-424/global_step424/mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..731e8ab7905cb6279d5ffb56c894b5688cb706e1
--- /dev/null
+++ b/checkpoint-424/global_step424/mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e3ab40c9d5240d9bf46f4b18144f5acabb3961599aa82c21ec1d3381941cd210
+size 11918643933
diff --git a/checkpoint-424/latest b/checkpoint-424/latest
new file mode 100644
index 0000000000000000000000000000000000000000..61139b6d89ae00cc09dfb55f822bf31877673d67
--- /dev/null
+++ b/checkpoint-424/latest
@@ -0,0 +1 @@
+global_step424
\ No newline at end of file
diff --git a/checkpoint-424/rng_state_0.pth b/checkpoint-424/rng_state_0.pth
new file mode 100644
index 0000000000000000000000000000000000000000..aba0404f2e69e7d82296d288a3d126d49db8c6ab
--- /dev/null
+++ b/checkpoint-424/rng_state_0.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:82e5a32a087e49f83e90a35631782fb8d2f2d30e88836e47f6ede6b9024c503b
+size 14768
diff --git a/checkpoint-424/rng_state_1.pth b/checkpoint-424/rng_state_1.pth
new file mode 100644
index 0000000000000000000000000000000000000000..c8b285de4dfd46e59bd30a0bf2a18e5906d2e6b5
--- /dev/null
+++ b/checkpoint-424/rng_state_1.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a3cfb9e4be244058513b077bb5da890bcf453feff8785721e7be652bca9851ff
+size 14768
diff --git a/checkpoint-424/rng_state_2.pth b/checkpoint-424/rng_state_2.pth
new file mode 100644
index 0000000000000000000000000000000000000000..3188ac5d63a9db2e2f18fcd45e3f8bc25df3b390
--- /dev/null
+++ b/checkpoint-424/rng_state_2.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a3692ab5f6c663f7b24a8011babeca9adf6d43f9f56ad6debc32b019389fbe70
+size 14768
diff --git a/checkpoint-424/scheduler.pt b/checkpoint-424/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..2077e244a4e994237918818c81c125bbb50e58b6
--- /dev/null
+++ b/checkpoint-424/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:52cf95ba13b0a43ab308e661ee3f7d0701396ebbcb97885c4401415ed201f8fd
+size 1064
diff --git a/checkpoint-424/special_tokens_map.json b/checkpoint-424/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..278b7f0f84be865c4687700ee7b3c63d89a51e18
--- /dev/null
+++ b/checkpoint-424/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+ "bos_token": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "<|eot_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/checkpoint-424/tokenizer.json b/checkpoint-424/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2
--- /dev/null
+++ b/checkpoint-424/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
+size 17209920
diff --git a/checkpoint-424/tokenizer_config.json b/checkpoint-424/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ca91a2ef55f4239a7af81d7c9abb05f53621a07b
--- /dev/null
+++ b/checkpoint-424/tokenizer_config.json
@@ -0,0 +1,2064 @@
+{
+ "added_tokens_decoder": {
+ "128000": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128001": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128002": {
+ "content": "<|reserved_special_token_0|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128003": {
+ "content": "<|reserved_special_token_1|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128004": {
+ "content": "<|finetune_right_pad_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128005": {
+ "content": "<|reserved_special_token_2|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128006": {
+ "content": "<|start_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128007": {
+ "content": "<|end_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128008": {
+ "content": "<|eom_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128009": {
+ "content": "<|eot_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128010": {
+ "content": "<|python_tag|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128011": {
+ "content": "<|reserved_special_token_3|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128012": {
+ "content": "<|reserved_special_token_4|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128013": {
+ "content": "<|reserved_special_token_5|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128014": {
+ "content": "<|reserved_special_token_6|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128015": {
+ "content": "<|reserved_special_token_7|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128016": {
+ "content": "<|reserved_special_token_8|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128017": {
+ "content": "<|reserved_special_token_9|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128018": {
+ "content": "<|reserved_special_token_10|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128019": {
+ "content": "<|reserved_special_token_11|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128020": {
+ "content": "<|reserved_special_token_12|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128021": {
+ "content": "<|reserved_special_token_13|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128022": {
+ "content": "<|reserved_special_token_14|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128023": {
+ "content": "<|reserved_special_token_15|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128024": {
+ "content": "<|reserved_special_token_16|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128025": {
+ "content": "<|reserved_special_token_17|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128026": {
+ "content": "<|reserved_special_token_18|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128027": {
+ "content": "<|reserved_special_token_19|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128028": {
+ "content": "<|reserved_special_token_20|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128029": {
+ "content": "<|reserved_special_token_21|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128030": {
+ "content": "<|reserved_special_token_22|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128031": {
+ "content": "<|reserved_special_token_23|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128032": {
+ "content": "<|reserved_special_token_24|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128033": {
+ "content": "<|reserved_special_token_25|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128034": {
+ "content": "<|reserved_special_token_26|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128035": {
+ "content": "<|reserved_special_token_27|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128036": {
+ "content": "<|reserved_special_token_28|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128037": {
+ "content": "<|reserved_special_token_29|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128038": {
+ "content": "<|reserved_special_token_30|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128039": {
+ "content": "<|reserved_special_token_31|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128040": {
+ "content": "<|reserved_special_token_32|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128041": {
+ "content": "<|reserved_special_token_33|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128042": {
+ "content": "<|reserved_special_token_34|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128043": {
+ "content": "<|reserved_special_token_35|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128044": {
+ "content": "<|reserved_special_token_36|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128045": {
+ "content": "<|reserved_special_token_37|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128046": {
+ "content": "<|reserved_special_token_38|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128047": {
+ "content": "<|reserved_special_token_39|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128048": {
+ "content": "<|reserved_special_token_40|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128049": {
+ "content": "<|reserved_special_token_41|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128050": {
+ "content": "<|reserved_special_token_42|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128051": {
+ "content": "<|reserved_special_token_43|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128052": {
+ "content": "<|reserved_special_token_44|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128053": {
+ "content": "<|reserved_special_token_45|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128054": {
+ "content": "<|reserved_special_token_46|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128055": {
+ "content": "<|reserved_special_token_47|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128056": {
+ "content": "<|reserved_special_token_48|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128057": {
+ "content": "<|reserved_special_token_49|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128058": {
+ "content": "<|reserved_special_token_50|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128059": {
+ "content": "<|reserved_special_token_51|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128060": {
+ "content": "<|reserved_special_token_52|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128061": {
+ "content": "<|reserved_special_token_53|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128062": {
+ "content": "<|reserved_special_token_54|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128063": {
+ "content": "<|reserved_special_token_55|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128064": {
+ "content": "<|reserved_special_token_56|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128065": {
+ "content": "<|reserved_special_token_57|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128066": {
+ "content": "<|reserved_special_token_58|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128067": {
+ "content": "<|reserved_special_token_59|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128068": {
+ "content": "<|reserved_special_token_60|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128069": {
+ "content": "<|reserved_special_token_61|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128070": {
+ "content": "<|reserved_special_token_62|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128071": {
+ "content": "<|reserved_special_token_63|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128072": {
+ "content": "<|reserved_special_token_64|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128073": {
+ "content": "<|reserved_special_token_65|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128074": {
+ "content": "<|reserved_special_token_66|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128075": {
+ "content": "<|reserved_special_token_67|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128076": {
+ "content": "<|reserved_special_token_68|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128077": {
+ "content": "<|reserved_special_token_69|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128078": {
+ "content": "<|reserved_special_token_70|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128079": {
+ "content": "<|reserved_special_token_71|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128080": {
+ "content": "<|reserved_special_token_72|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128081": {
+ "content": "<|reserved_special_token_73|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128082": {
+ "content": "<|reserved_special_token_74|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128083": {
+ "content": "<|reserved_special_token_75|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128084": {
+ "content": "<|reserved_special_token_76|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128085": {
+ "content": "<|reserved_special_token_77|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128086": {
+ "content": "<|reserved_special_token_78|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128087": {
+ "content": "<|reserved_special_token_79|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128088": {
+ "content": "<|reserved_special_token_80|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128089": {
+ "content": "<|reserved_special_token_81|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128090": {
+ "content": "<|reserved_special_token_82|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128091": {
+ "content": "<|reserved_special_token_83|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128092": {
+ "content": "<|reserved_special_token_84|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128093": {
+ "content": "<|reserved_special_token_85|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128094": {
+ "content": "<|reserved_special_token_86|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128095": {
+ "content": "<|reserved_special_token_87|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128096": {
+ "content": "<|reserved_special_token_88|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128097": {
+ "content": "<|reserved_special_token_89|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128098": {
+ "content": "<|reserved_special_token_90|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128099": {
+ "content": "<|reserved_special_token_91|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128100": {
+ "content": "<|reserved_special_token_92|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128101": {
+ "content": "<|reserved_special_token_93|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128102": {
+ "content": "<|reserved_special_token_94|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128103": {
+ "content": "<|reserved_special_token_95|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128104": {
+ "content": "<|reserved_special_token_96|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128105": {
+ "content": "<|reserved_special_token_97|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128106": {
+ "content": "<|reserved_special_token_98|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128107": {
+ "content": "<|reserved_special_token_99|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128108": {
+ "content": "<|reserved_special_token_100|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128109": {
+ "content": "<|reserved_special_token_101|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128110": {
+ "content": "<|reserved_special_token_102|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128111": {
+ "content": "<|reserved_special_token_103|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128112": {
+ "content": "<|reserved_special_token_104|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128113": {
+ "content": "<|reserved_special_token_105|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128114": {
+ "content": "<|reserved_special_token_106|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128115": {
+ "content": "<|reserved_special_token_107|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128116": {
+ "content": "<|reserved_special_token_108|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128117": {
+ "content": "<|reserved_special_token_109|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128118": {
+ "content": "<|reserved_special_token_110|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128119": {
+ "content": "<|reserved_special_token_111|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128120": {
+ "content": "<|reserved_special_token_112|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128121": {
+ "content": "<|reserved_special_token_113|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128122": {
+ "content": "<|reserved_special_token_114|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128123": {
+ "content": "<|reserved_special_token_115|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128124": {
+ "content": "<|reserved_special_token_116|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128125": {
+ "content": "<|reserved_special_token_117|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128126": {
+ "content": "<|reserved_special_token_118|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128127": {
+ "content": "<|reserved_special_token_119|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128128": {
+ "content": "<|reserved_special_token_120|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128129": {
+ "content": "<|reserved_special_token_121|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128130": {
+ "content": "<|reserved_special_token_122|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128131": {
+ "content": "<|reserved_special_token_123|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128132": {
+ "content": "<|reserved_special_token_124|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128133": {
+ "content": "<|reserved_special_token_125|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128134": {
+ "content": "<|reserved_special_token_126|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128135": {
+ "content": "<|reserved_special_token_127|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128136": {
+ "content": "<|reserved_special_token_128|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128137": {
+ "content": "<|reserved_special_token_129|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128138": {
+ "content": "<|reserved_special_token_130|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128139": {
+ "content": "<|reserved_special_token_131|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128140": {
+ "content": "<|reserved_special_token_132|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128141": {
+ "content": "<|reserved_special_token_133|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128142": {
+ "content": "<|reserved_special_token_134|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128143": {
+ "content": "<|reserved_special_token_135|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128144": {
+ "content": "<|reserved_special_token_136|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128145": {
+ "content": "<|reserved_special_token_137|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128146": {
+ "content": "<|reserved_special_token_138|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128147": {
+ "content": "<|reserved_special_token_139|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128148": {
+ "content": "<|reserved_special_token_140|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128149": {
+ "content": "<|reserved_special_token_141|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128150": {
+ "content": "<|reserved_special_token_142|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128151": {
+ "content": "<|reserved_special_token_143|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128152": {
+ "content": "<|reserved_special_token_144|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128153": {
+ "content": "<|reserved_special_token_145|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128154": {
+ "content": "<|reserved_special_token_146|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128155": {
+ "content": "<|reserved_special_token_147|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128156": {
+ "content": "<|reserved_special_token_148|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128157": {
+ "content": "<|reserved_special_token_149|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128158": {
+ "content": "<|reserved_special_token_150|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128159": {
+ "content": "<|reserved_special_token_151|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128160": {
+ "content": "<|reserved_special_token_152|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128161": {
+ "content": "<|reserved_special_token_153|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128162": {
+ "content": "<|reserved_special_token_154|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128163": {
+ "content": "<|reserved_special_token_155|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128164": {
+ "content": "<|reserved_special_token_156|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128165": {
+ "content": "<|reserved_special_token_157|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128166": {
+ "content": "<|reserved_special_token_158|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128167": {
+ "content": "<|reserved_special_token_159|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128168": {
+ "content": "<|reserved_special_token_160|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128169": {
+ "content": "<|reserved_special_token_161|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128170": {
+ "content": "<|reserved_special_token_162|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128171": {
+ "content": "<|reserved_special_token_163|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128172": {
+ "content": "<|reserved_special_token_164|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128173": {
+ "content": "<|reserved_special_token_165|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128174": {
+ "content": "<|reserved_special_token_166|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128175": {
+ "content": "<|reserved_special_token_167|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128176": {
+ "content": "<|reserved_special_token_168|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128177": {
+ "content": "<|reserved_special_token_169|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128178": {
+ "content": "<|reserved_special_token_170|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128179": {
+ "content": "<|reserved_special_token_171|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128180": {
+ "content": "<|reserved_special_token_172|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128181": {
+ "content": "<|reserved_special_token_173|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128182": {
+ "content": "<|reserved_special_token_174|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128183": {
+ "content": "<|reserved_special_token_175|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128184": {
+ "content": "<|reserved_special_token_176|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128185": {
+ "content": "<|reserved_special_token_177|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128186": {
+ "content": "<|reserved_special_token_178|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128187": {
+ "content": "<|reserved_special_token_179|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128188": {
+ "content": "<|reserved_special_token_180|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128189": {
+ "content": "<|reserved_special_token_181|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128190": {
+ "content": "<|reserved_special_token_182|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128191": {
+ "content": "<|reserved_special_token_183|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128192": {
+ "content": "<|reserved_special_token_184|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128193": {
+ "content": "<|reserved_special_token_185|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128194": {
+ "content": "<|reserved_special_token_186|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128195": {
+ "content": "<|reserved_special_token_187|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128196": {
+ "content": "<|reserved_special_token_188|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128197": {
+ "content": "<|reserved_special_token_189|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128198": {
+ "content": "<|reserved_special_token_190|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128199": {
+ "content": "<|reserved_special_token_191|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128200": {
+ "content": "<|reserved_special_token_192|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128201": {
+ "content": "<|reserved_special_token_193|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128202": {
+ "content": "<|reserved_special_token_194|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128203": {
+ "content": "<|reserved_special_token_195|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128204": {
+ "content": "<|reserved_special_token_196|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128205": {
+ "content": "<|reserved_special_token_197|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128206": {
+ "content": "<|reserved_special_token_198|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128207": {
+ "content": "<|reserved_special_token_199|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128208": {
+ "content": "<|reserved_special_token_200|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128209": {
+ "content": "<|reserved_special_token_201|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128210": {
+ "content": "<|reserved_special_token_202|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128211": {
+ "content": "<|reserved_special_token_203|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128212": {
+ "content": "<|reserved_special_token_204|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128213": {
+ "content": "<|reserved_special_token_205|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128214": {
+ "content": "<|reserved_special_token_206|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128215": {
+ "content": "<|reserved_special_token_207|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128216": {
+ "content": "<|reserved_special_token_208|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128217": {
+ "content": "<|reserved_special_token_209|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128218": {
+ "content": "<|reserved_special_token_210|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128219": {
+ "content": "<|reserved_special_token_211|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128220": {
+ "content": "<|reserved_special_token_212|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128221": {
+ "content": "<|reserved_special_token_213|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128222": {
+ "content": "<|reserved_special_token_214|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128223": {
+ "content": "<|reserved_special_token_215|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128224": {
+ "content": "<|reserved_special_token_216|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128225": {
+ "content": "<|reserved_special_token_217|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128226": {
+ "content": "<|reserved_special_token_218|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128227": {
+ "content": "<|reserved_special_token_219|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128228": {
+ "content": "<|reserved_special_token_220|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128229": {
+ "content": "<|reserved_special_token_221|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128230": {
+ "content": "<|reserved_special_token_222|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128231": {
+ "content": "<|reserved_special_token_223|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128232": {
+ "content": "<|reserved_special_token_224|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128233": {
+ "content": "<|reserved_special_token_225|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128234": {
+ "content": "<|reserved_special_token_226|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128235": {
+ "content": "<|reserved_special_token_227|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128236": {
+ "content": "<|reserved_special_token_228|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128237": {
+ "content": "<|reserved_special_token_229|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128238": {
+ "content": "<|reserved_special_token_230|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128239": {
+ "content": "<|reserved_special_token_231|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128240": {
+ "content": "<|reserved_special_token_232|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128241": {
+ "content": "<|reserved_special_token_233|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128242": {
+ "content": "<|reserved_special_token_234|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128243": {
+ "content": "<|reserved_special_token_235|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128244": {
+ "content": "<|reserved_special_token_236|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128245": {
+ "content": "<|reserved_special_token_237|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128246": {
+ "content": "<|reserved_special_token_238|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128247": {
+ "content": "<|reserved_special_token_239|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128248": {
+ "content": "<|reserved_special_token_240|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128249": {
+ "content": "<|reserved_special_token_241|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128250": {
+ "content": "<|reserved_special_token_242|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128251": {
+ "content": "<|reserved_special_token_243|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128252": {
+ "content": "<|reserved_special_token_244|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128253": {
+ "content": "<|reserved_special_token_245|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128254": {
+ "content": "<|reserved_special_token_246|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128255": {
+ "content": "<|reserved_special_token_247|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<|begin_of_text|>",
+ "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n",
+ "clean_up_tokenization_spaces": true,
+ "eos_token": "<|eot_id|>",
+ "extra_special_tokens": {},
+ "model_input_names": [
+ "input_ids",
+ "attention_mask"
+ ],
+ "model_max_length": 131072,
+ "pad_token": "<|end_of_text|>",
+ "tokenizer_class": "PreTrainedTokenizer"
+}
diff --git a/checkpoint-424/trainer_state.json b/checkpoint-424/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..6ef35607e1e6d94eac1e364fde9d01c1d9fa1b29
--- /dev/null
+++ b/checkpoint-424/trainer_state.json
@@ -0,0 +1,3001 @@
+{
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 1.9968652037617556,
+ "eval_steps": 500,
+ "global_step": 424,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.004702194357366771,
+ "grad_norm": 3.1606569290161133,
+ "learning_rate": 5.0000000000000004e-08,
+ "loss": 1.0072,
+ "step": 1
+ },
+ {
+ "epoch": 0.009404388714733543,
+ "grad_norm": 3.2058725357055664,
+ "learning_rate": 1.0000000000000001e-07,
+ "loss": 1.0134,
+ "step": 2
+ },
+ {
+ "epoch": 0.014106583072100314,
+ "grad_norm": 2.636291265487671,
+ "learning_rate": 1.5000000000000002e-07,
+ "loss": 0.9635,
+ "step": 3
+ },
+ {
+ "epoch": 0.018808777429467086,
+ "grad_norm": 2.708746910095215,
+ "learning_rate": 2.0000000000000002e-07,
+ "loss": 1.0068,
+ "step": 4
+ },
+ {
+ "epoch": 0.023510971786833857,
+ "grad_norm": 2.8948426246643066,
+ "learning_rate": 2.5000000000000004e-07,
+ "loss": 0.9608,
+ "step": 5
+ },
+ {
+ "epoch": 0.02821316614420063,
+ "grad_norm": 2.8740086555480957,
+ "learning_rate": 3.0000000000000004e-07,
+ "loss": 0.9896,
+ "step": 6
+ },
+ {
+ "epoch": 0.032915360501567396,
+ "grad_norm": 2.8338170051574707,
+ "learning_rate": 3.5000000000000004e-07,
+ "loss": 0.9098,
+ "step": 7
+ },
+ {
+ "epoch": 0.03761755485893417,
+ "grad_norm": 2.7783002853393555,
+ "learning_rate": 4.0000000000000003e-07,
+ "loss": 0.9733,
+ "step": 8
+ },
+ {
+ "epoch": 0.04231974921630094,
+ "grad_norm": 3.043574333190918,
+ "learning_rate": 4.5000000000000003e-07,
+ "loss": 0.9943,
+ "step": 9
+ },
+ {
+ "epoch": 0.047021943573667714,
+ "grad_norm": 3.142383337020874,
+ "learning_rate": 5.000000000000001e-07,
+ "loss": 0.9475,
+ "step": 10
+ },
+ {
+ "epoch": 0.05172413793103448,
+ "grad_norm": 2.9817280769348145,
+ "learning_rate": 5.5e-07,
+ "loss": 0.9701,
+ "step": 11
+ },
+ {
+ "epoch": 0.05642633228840126,
+ "grad_norm": 2.95699405670166,
+ "learning_rate": 6.000000000000001e-07,
+ "loss": 0.9983,
+ "step": 12
+ },
+ {
+ "epoch": 0.061128526645768025,
+ "grad_norm": 2.8782453536987305,
+ "learning_rate": 6.5e-07,
+ "loss": 0.9502,
+ "step": 13
+ },
+ {
+ "epoch": 0.06583072100313479,
+ "grad_norm": 2.6715071201324463,
+ "learning_rate": 7.000000000000001e-07,
+ "loss": 0.9436,
+ "step": 14
+ },
+ {
+ "epoch": 0.07053291536050156,
+ "grad_norm": 3.869649648666382,
+ "learning_rate": 7.5e-07,
+ "loss": 0.9692,
+ "step": 15
+ },
+ {
+ "epoch": 0.07523510971786834,
+ "grad_norm": 3.060220956802368,
+ "learning_rate": 8.000000000000001e-07,
+ "loss": 0.9258,
+ "step": 16
+ },
+ {
+ "epoch": 0.07993730407523511,
+ "grad_norm": 2.8922741413116455,
+ "learning_rate": 8.500000000000001e-07,
+ "loss": 0.9719,
+ "step": 17
+ },
+ {
+ "epoch": 0.08463949843260188,
+ "grad_norm": 2.7857820987701416,
+ "learning_rate": 9.000000000000001e-07,
+ "loss": 0.9072,
+ "step": 18
+ },
+ {
+ "epoch": 0.08934169278996865,
+ "grad_norm": 2.9753293991088867,
+ "learning_rate": 9.500000000000001e-07,
+ "loss": 0.9032,
+ "step": 19
+ },
+ {
+ "epoch": 0.09404388714733543,
+ "grad_norm": 2.7989683151245117,
+ "learning_rate": 1.0000000000000002e-06,
+ "loss": 0.8887,
+ "step": 20
+ },
+ {
+ "epoch": 0.0987460815047022,
+ "grad_norm": 2.3953049182891846,
+ "learning_rate": 1.0500000000000001e-06,
+ "loss": 0.8968,
+ "step": 21
+ },
+ {
+ "epoch": 0.10344827586206896,
+ "grad_norm": 2.643731117248535,
+ "learning_rate": 1.1e-06,
+ "loss": 0.8501,
+ "step": 22
+ },
+ {
+ "epoch": 0.10815047021943573,
+ "grad_norm": 2.3679006099700928,
+ "learning_rate": 1.1500000000000002e-06,
+ "loss": 0.8476,
+ "step": 23
+ },
+ {
+ "epoch": 0.11285266457680251,
+ "grad_norm": 2.5935540199279785,
+ "learning_rate": 1.2000000000000002e-06,
+ "loss": 0.8095,
+ "step": 24
+ },
+ {
+ "epoch": 0.11755485893416928,
+ "grad_norm": 2.510300636291504,
+ "learning_rate": 1.25e-06,
+ "loss": 0.8099,
+ "step": 25
+ },
+ {
+ "epoch": 0.12225705329153605,
+ "grad_norm": 2.372344970703125,
+ "learning_rate": 1.3e-06,
+ "loss": 0.7869,
+ "step": 26
+ },
+ {
+ "epoch": 0.12695924764890282,
+ "grad_norm": 2.303426504135132,
+ "learning_rate": 1.3500000000000002e-06,
+ "loss": 0.7758,
+ "step": 27
+ },
+ {
+ "epoch": 0.13166144200626959,
+ "grad_norm": 1.9017939567565918,
+ "learning_rate": 1.4000000000000001e-06,
+ "loss": 0.7498,
+ "step": 28
+ },
+ {
+ "epoch": 0.13636363636363635,
+ "grad_norm": 1.8810580968856812,
+ "learning_rate": 1.45e-06,
+ "loss": 0.7878,
+ "step": 29
+ },
+ {
+ "epoch": 0.14106583072100312,
+ "grad_norm": 1.7797424793243408,
+ "learning_rate": 1.5e-06,
+ "loss": 0.7747,
+ "step": 30
+ },
+ {
+ "epoch": 0.14576802507836992,
+ "grad_norm": 1.5053879022598267,
+ "learning_rate": 1.5500000000000002e-06,
+ "loss": 0.7735,
+ "step": 31
+ },
+ {
+ "epoch": 0.15047021943573669,
+ "grad_norm": 1.4909234046936035,
+ "learning_rate": 1.6000000000000001e-06,
+ "loss": 0.7654,
+ "step": 32
+ },
+ {
+ "epoch": 0.15517241379310345,
+ "grad_norm": 1.36083984375,
+ "learning_rate": 1.6500000000000003e-06,
+ "loss": 0.6895,
+ "step": 33
+ },
+ {
+ "epoch": 0.15987460815047022,
+ "grad_norm": 1.536014199256897,
+ "learning_rate": 1.7000000000000002e-06,
+ "loss": 0.675,
+ "step": 34
+ },
+ {
+ "epoch": 0.164576802507837,
+ "grad_norm": 1.3426779508590698,
+ "learning_rate": 1.75e-06,
+ "loss": 0.7652,
+ "step": 35
+ },
+ {
+ "epoch": 0.16927899686520376,
+ "grad_norm": 1.4900612831115723,
+ "learning_rate": 1.8000000000000001e-06,
+ "loss": 0.6863,
+ "step": 36
+ },
+ {
+ "epoch": 0.17398119122257052,
+ "grad_norm": 1.181241750717163,
+ "learning_rate": 1.85e-06,
+ "loss": 0.7136,
+ "step": 37
+ },
+ {
+ "epoch": 0.1786833855799373,
+ "grad_norm": 1.461419701576233,
+ "learning_rate": 1.9000000000000002e-06,
+ "loss": 0.7606,
+ "step": 38
+ },
+ {
+ "epoch": 0.1833855799373041,
+ "grad_norm": 1.04817795753479,
+ "learning_rate": 1.9500000000000004e-06,
+ "loss": 0.6829,
+ "step": 39
+ },
+ {
+ "epoch": 0.18808777429467086,
+ "grad_norm": 1.0499993562698364,
+ "learning_rate": 2.0000000000000003e-06,
+ "loss": 0.7144,
+ "step": 40
+ },
+ {
+ "epoch": 0.19278996865203762,
+ "grad_norm": 0.9935064315795898,
+ "learning_rate": 2.05e-06,
+ "loss": 0.6736,
+ "step": 41
+ },
+ {
+ "epoch": 0.1974921630094044,
+ "grad_norm": 0.9919099807739258,
+ "learning_rate": 2.1000000000000002e-06,
+ "loss": 0.7151,
+ "step": 42
+ },
+ {
+ "epoch": 0.20219435736677116,
+ "grad_norm": 0.919556200504303,
+ "learning_rate": 2.15e-06,
+ "loss": 0.6847,
+ "step": 43
+ },
+ {
+ "epoch": 0.20689655172413793,
+ "grad_norm": 1.4762015342712402,
+ "learning_rate": 2.2e-06,
+ "loss": 0.6694,
+ "step": 44
+ },
+ {
+ "epoch": 0.2115987460815047,
+ "grad_norm": 0.9243163466453552,
+ "learning_rate": 2.25e-06,
+ "loss": 0.6489,
+ "step": 45
+ },
+ {
+ "epoch": 0.21630094043887146,
+ "grad_norm": 0.7614469528198242,
+ "learning_rate": 2.3000000000000004e-06,
+ "loss": 0.6568,
+ "step": 46
+ },
+ {
+ "epoch": 0.22100313479623823,
+ "grad_norm": 0.7543922662734985,
+ "learning_rate": 2.35e-06,
+ "loss": 0.6359,
+ "step": 47
+ },
+ {
+ "epoch": 0.22570532915360503,
+ "grad_norm": 0.7558912038803101,
+ "learning_rate": 2.4000000000000003e-06,
+ "loss": 0.6231,
+ "step": 48
+ },
+ {
+ "epoch": 0.2304075235109718,
+ "grad_norm": 0.7822129130363464,
+ "learning_rate": 2.4500000000000003e-06,
+ "loss": 0.6691,
+ "step": 49
+ },
+ {
+ "epoch": 0.23510971786833856,
+ "grad_norm": 0.8646999597549438,
+ "learning_rate": 2.5e-06,
+ "loss": 0.682,
+ "step": 50
+ },
+ {
+ "epoch": 0.23981191222570533,
+ "grad_norm": 0.8824774622917175,
+ "learning_rate": 2.55e-06,
+ "loss": 0.6805,
+ "step": 51
+ },
+ {
+ "epoch": 0.2445141065830721,
+ "grad_norm": 0.7697399258613586,
+ "learning_rate": 2.6e-06,
+ "loss": 0.6368,
+ "step": 52
+ },
+ {
+ "epoch": 0.24921630094043887,
+ "grad_norm": 0.6522512435913086,
+ "learning_rate": 2.6500000000000005e-06,
+ "loss": 0.6367,
+ "step": 53
+ },
+ {
+ "epoch": 0.25391849529780564,
+ "grad_norm": 0.6172305941581726,
+ "learning_rate": 2.7000000000000004e-06,
+ "loss": 0.6291,
+ "step": 54
+ },
+ {
+ "epoch": 0.25862068965517243,
+ "grad_norm": 0.7860460877418518,
+ "learning_rate": 2.7500000000000004e-06,
+ "loss": 0.6736,
+ "step": 55
+ },
+ {
+ "epoch": 0.26332288401253917,
+ "grad_norm": 0.6474862694740295,
+ "learning_rate": 2.8000000000000003e-06,
+ "loss": 0.6365,
+ "step": 56
+ },
+ {
+ "epoch": 0.26802507836990597,
+ "grad_norm": 0.6867114901542664,
+ "learning_rate": 2.85e-06,
+ "loss": 0.6397,
+ "step": 57
+ },
+ {
+ "epoch": 0.2727272727272727,
+ "grad_norm": 0.7056852579116821,
+ "learning_rate": 2.9e-06,
+ "loss": 0.6138,
+ "step": 58
+ },
+ {
+ "epoch": 0.2774294670846395,
+ "grad_norm": 0.6615664958953857,
+ "learning_rate": 2.95e-06,
+ "loss": 0.6482,
+ "step": 59
+ },
+ {
+ "epoch": 0.28213166144200624,
+ "grad_norm": 0.6649022102355957,
+ "learning_rate": 3e-06,
+ "loss": 0.6745,
+ "step": 60
+ },
+ {
+ "epoch": 0.28683385579937304,
+ "grad_norm": 0.850848913192749,
+ "learning_rate": 3.05e-06,
+ "loss": 0.5956,
+ "step": 61
+ },
+ {
+ "epoch": 0.29153605015673983,
+ "grad_norm": 0.5983562469482422,
+ "learning_rate": 3.1000000000000004e-06,
+ "loss": 0.5894,
+ "step": 62
+ },
+ {
+ "epoch": 0.2962382445141066,
+ "grad_norm": 0.6286782622337341,
+ "learning_rate": 3.1500000000000003e-06,
+ "loss": 0.6329,
+ "step": 63
+ },
+ {
+ "epoch": 0.30094043887147337,
+ "grad_norm": 0.5919945240020752,
+ "learning_rate": 3.2000000000000003e-06,
+ "loss": 0.6402,
+ "step": 64
+ },
+ {
+ "epoch": 0.3056426332288401,
+ "grad_norm": 0.5632765889167786,
+ "learning_rate": 3.2500000000000002e-06,
+ "loss": 0.5862,
+ "step": 65
+ },
+ {
+ "epoch": 0.3103448275862069,
+ "grad_norm": 0.7692590951919556,
+ "learning_rate": 3.3000000000000006e-06,
+ "loss": 0.6031,
+ "step": 66
+ },
+ {
+ "epoch": 0.31504702194357365,
+ "grad_norm": 0.7313893437385559,
+ "learning_rate": 3.3500000000000005e-06,
+ "loss": 0.6312,
+ "step": 67
+ },
+ {
+ "epoch": 0.31974921630094044,
+ "grad_norm": 0.6097120642662048,
+ "learning_rate": 3.4000000000000005e-06,
+ "loss": 0.5986,
+ "step": 68
+ },
+ {
+ "epoch": 0.32445141065830724,
+ "grad_norm": 0.5853808522224426,
+ "learning_rate": 3.45e-06,
+ "loss": 0.5847,
+ "step": 69
+ },
+ {
+ "epoch": 0.329153605015674,
+ "grad_norm": 0.6093555092811584,
+ "learning_rate": 3.5e-06,
+ "loss": 0.6552,
+ "step": 70
+ },
+ {
+ "epoch": 0.3338557993730408,
+ "grad_norm": 0.6106334328651428,
+ "learning_rate": 3.5500000000000003e-06,
+ "loss": 0.6196,
+ "step": 71
+ },
+ {
+ "epoch": 0.3385579937304075,
+ "grad_norm": 0.9254828691482544,
+ "learning_rate": 3.6000000000000003e-06,
+ "loss": 0.6005,
+ "step": 72
+ },
+ {
+ "epoch": 0.3432601880877743,
+ "grad_norm": 0.5471694469451904,
+ "learning_rate": 3.65e-06,
+ "loss": 0.5907,
+ "step": 73
+ },
+ {
+ "epoch": 0.34796238244514105,
+ "grad_norm": 0.6204228401184082,
+ "learning_rate": 3.7e-06,
+ "loss": 0.6079,
+ "step": 74
+ },
+ {
+ "epoch": 0.35266457680250785,
+ "grad_norm": 0.52458256483078,
+ "learning_rate": 3.7500000000000005e-06,
+ "loss": 0.6001,
+ "step": 75
+ },
+ {
+ "epoch": 0.3573667711598746,
+ "grad_norm": 0.5356763601303101,
+ "learning_rate": 3.8000000000000005e-06,
+ "loss": 0.5987,
+ "step": 76
+ },
+ {
+ "epoch": 0.3620689655172414,
+ "grad_norm": 0.5408467054367065,
+ "learning_rate": 3.85e-06,
+ "loss": 0.6104,
+ "step": 77
+ },
+ {
+ "epoch": 0.3667711598746082,
+ "grad_norm": 0.5075871348381042,
+ "learning_rate": 3.900000000000001e-06,
+ "loss": 0.5569,
+ "step": 78
+ },
+ {
+ "epoch": 0.3714733542319749,
+ "grad_norm": 0.8474109768867493,
+ "learning_rate": 3.95e-06,
+ "loss": 0.6195,
+ "step": 79
+ },
+ {
+ "epoch": 0.3761755485893417,
+ "grad_norm": 0.4750897288322449,
+ "learning_rate": 4.000000000000001e-06,
+ "loss": 0.5399,
+ "step": 80
+ },
+ {
+ "epoch": 0.38087774294670845,
+ "grad_norm": 0.5082002878189087,
+ "learning_rate": 4.05e-06,
+ "loss": 0.5997,
+ "step": 81
+ },
+ {
+ "epoch": 0.38557993730407525,
+ "grad_norm": 0.5343796014785767,
+ "learning_rate": 4.1e-06,
+ "loss": 0.5704,
+ "step": 82
+ },
+ {
+ "epoch": 0.390282131661442,
+ "grad_norm": 0.520311713218689,
+ "learning_rate": 4.15e-06,
+ "loss": 0.5818,
+ "step": 83
+ },
+ {
+ "epoch": 0.3949843260188088,
+ "grad_norm": 0.5292978286743164,
+ "learning_rate": 4.2000000000000004e-06,
+ "loss": 0.5852,
+ "step": 84
+ },
+ {
+ "epoch": 0.3996865203761755,
+ "grad_norm": 0.539886474609375,
+ "learning_rate": 4.25e-06,
+ "loss": 0.6057,
+ "step": 85
+ },
+ {
+ "epoch": 0.4043887147335423,
+ "grad_norm": 0.6468827128410339,
+ "learning_rate": 4.3e-06,
+ "loss": 0.6122,
+ "step": 86
+ },
+ {
+ "epoch": 0.4090909090909091,
+ "grad_norm": 0.5537365078926086,
+ "learning_rate": 4.350000000000001e-06,
+ "loss": 0.5652,
+ "step": 87
+ },
+ {
+ "epoch": 0.41379310344827586,
+ "grad_norm": 0.6226018667221069,
+ "learning_rate": 4.4e-06,
+ "loss": 0.5884,
+ "step": 88
+ },
+ {
+ "epoch": 0.41849529780564265,
+ "grad_norm": 0.5016945004463196,
+ "learning_rate": 4.450000000000001e-06,
+ "loss": 0.5877,
+ "step": 89
+ },
+ {
+ "epoch": 0.4231974921630094,
+ "grad_norm": 0.5059167146682739,
+ "learning_rate": 4.5e-06,
+ "loss": 0.5676,
+ "step": 90
+ },
+ {
+ "epoch": 0.4278996865203762,
+ "grad_norm": 0.47521743178367615,
+ "learning_rate": 4.5500000000000005e-06,
+ "loss": 0.5929,
+ "step": 91
+ },
+ {
+ "epoch": 0.43260188087774293,
+ "grad_norm": 0.531306266784668,
+ "learning_rate": 4.600000000000001e-06,
+ "loss": 0.5983,
+ "step": 92
+ },
+ {
+ "epoch": 0.4373040752351097,
+ "grad_norm": 0.4965567886829376,
+ "learning_rate": 4.65e-06,
+ "loss": 0.5279,
+ "step": 93
+ },
+ {
+ "epoch": 0.44200626959247646,
+ "grad_norm": 0.5125988125801086,
+ "learning_rate": 4.7e-06,
+ "loss": 0.5436,
+ "step": 94
+ },
+ {
+ "epoch": 0.44670846394984326,
+ "grad_norm": 0.557763934135437,
+ "learning_rate": 4.75e-06,
+ "loss": 0.5496,
+ "step": 95
+ },
+ {
+ "epoch": 0.45141065830721006,
+ "grad_norm": 0.6993274092674255,
+ "learning_rate": 4.800000000000001e-06,
+ "loss": 0.5498,
+ "step": 96
+ },
+ {
+ "epoch": 0.4561128526645768,
+ "grad_norm": 0.5485453009605408,
+ "learning_rate": 4.85e-06,
+ "loss": 0.5552,
+ "step": 97
+ },
+ {
+ "epoch": 0.4608150470219436,
+ "grad_norm": 1.9821522235870361,
+ "learning_rate": 4.9000000000000005e-06,
+ "loss": 0.569,
+ "step": 98
+ },
+ {
+ "epoch": 0.46551724137931033,
+ "grad_norm": 0.6074144840240479,
+ "learning_rate": 4.95e-06,
+ "loss": 0.5546,
+ "step": 99
+ },
+ {
+ "epoch": 0.4702194357366771,
+ "grad_norm": 0.5404040813446045,
+ "learning_rate": 5e-06,
+ "loss": 0.5775,
+ "step": 100
+ },
+ {
+ "epoch": 0.47492163009404387,
+ "grad_norm": 0.500438928604126,
+ "learning_rate": 4.9999910183883085e-06,
+ "loss": 0.5569,
+ "step": 101
+ },
+ {
+ "epoch": 0.47962382445141066,
+ "grad_norm": 0.5036981701850891,
+ "learning_rate": 4.999964073617768e-06,
+ "loss": 0.5663,
+ "step": 102
+ },
+ {
+ "epoch": 0.4843260188087774,
+ "grad_norm": 0.4537642300128937,
+ "learning_rate": 4.999919165881985e-06,
+ "loss": 0.5527,
+ "step": 103
+ },
+ {
+ "epoch": 0.4890282131661442,
+ "grad_norm": 0.49653521180152893,
+ "learning_rate": 4.999856295503635e-06,
+ "loss": 0.563,
+ "step": 104
+ },
+ {
+ "epoch": 0.493730407523511,
+ "grad_norm": 0.46847566962242126,
+ "learning_rate": 4.9997754629344596e-06,
+ "loss": 0.5425,
+ "step": 105
+ },
+ {
+ "epoch": 0.49843260188087773,
+ "grad_norm": 0.5192411541938782,
+ "learning_rate": 4.999676668755263e-06,
+ "loss": 0.5315,
+ "step": 106
+ },
+ {
+ "epoch": 0.5031347962382445,
+ "grad_norm": 0.5170287489891052,
+ "learning_rate": 4.999559913675912e-06,
+ "loss": 0.5627,
+ "step": 107
+ },
+ {
+ "epoch": 0.5078369905956113,
+ "grad_norm": 0.47297438979148865,
+ "learning_rate": 4.999425198535325e-06,
+ "loss": 0.5432,
+ "step": 108
+ },
+ {
+ "epoch": 0.512539184952978,
+ "grad_norm": 0.4873776137828827,
+ "learning_rate": 4.999272524301469e-06,
+ "loss": 0.5473,
+ "step": 109
+ },
+ {
+ "epoch": 0.5172413793103449,
+ "grad_norm": 0.5432935357093811,
+ "learning_rate": 4.9991018920713505e-06,
+ "loss": 0.5642,
+ "step": 110
+ },
+ {
+ "epoch": 0.5219435736677116,
+ "grad_norm": 0.4850105345249176,
+ "learning_rate": 4.9989133030710154e-06,
+ "loss": 0.548,
+ "step": 111
+ },
+ {
+ "epoch": 0.5266457680250783,
+ "grad_norm": 0.9399585723876953,
+ "learning_rate": 4.9987067586555275e-06,
+ "loss": 0.5471,
+ "step": 112
+ },
+ {
+ "epoch": 0.5313479623824452,
+ "grad_norm": 0.5167811512947083,
+ "learning_rate": 4.998482260308969e-06,
+ "loss": 0.5648,
+ "step": 113
+ },
+ {
+ "epoch": 0.5360501567398119,
+ "grad_norm": 0.5069029927253723,
+ "learning_rate": 4.998239809644427e-06,
+ "loss": 0.5568,
+ "step": 114
+ },
+ {
+ "epoch": 0.5407523510971787,
+ "grad_norm": 0.8738563656806946,
+ "learning_rate": 4.9979794084039755e-06,
+ "loss": 0.5719,
+ "step": 115
+ },
+ {
+ "epoch": 0.5454545454545454,
+ "grad_norm": 0.5216553807258606,
+ "learning_rate": 4.997701058458677e-06,
+ "loss": 0.5309,
+ "step": 116
+ },
+ {
+ "epoch": 0.5501567398119123,
+ "grad_norm": 0.9678344130516052,
+ "learning_rate": 4.997404761808554e-06,
+ "loss": 0.5645,
+ "step": 117
+ },
+ {
+ "epoch": 0.554858934169279,
+ "grad_norm": 0.496598482131958,
+ "learning_rate": 4.9970905205825845e-06,
+ "loss": 0.5711,
+ "step": 118
+ },
+ {
+ "epoch": 0.5595611285266457,
+ "grad_norm": 0.4745199680328369,
+ "learning_rate": 4.996758337038683e-06,
+ "loss": 0.5613,
+ "step": 119
+ },
+ {
+ "epoch": 0.5642633228840125,
+ "grad_norm": 0.5595977902412415,
+ "learning_rate": 4.996408213563684e-06,
+ "loss": 0.5559,
+ "step": 120
+ },
+ {
+ "epoch": 0.5689655172413793,
+ "grad_norm": 0.4743712544441223,
+ "learning_rate": 4.996040152673326e-06,
+ "loss": 0.5228,
+ "step": 121
+ },
+ {
+ "epoch": 0.5736677115987461,
+ "grad_norm": 0.5418100953102112,
+ "learning_rate": 4.995654157012233e-06,
+ "loss": 0.536,
+ "step": 122
+ },
+ {
+ "epoch": 0.5783699059561128,
+ "grad_norm": 0.521977424621582,
+ "learning_rate": 4.995250229353895e-06,
+ "loss": 0.5305,
+ "step": 123
+ },
+ {
+ "epoch": 0.5830721003134797,
+ "grad_norm": 0.5062761902809143,
+ "learning_rate": 4.99482837260065e-06,
+ "loss": 0.5417,
+ "step": 124
+ },
+ {
+ "epoch": 0.5877742946708464,
+ "grad_norm": 0.5895913243293762,
+ "learning_rate": 4.99438858978366e-06,
+ "loss": 0.573,
+ "step": 125
+ },
+ {
+ "epoch": 0.5924764890282131,
+ "grad_norm": 0.5442466139793396,
+ "learning_rate": 4.993930884062892e-06,
+ "loss": 0.5563,
+ "step": 126
+ },
+ {
+ "epoch": 0.5971786833855799,
+ "grad_norm": 0.5130571722984314,
+ "learning_rate": 4.993455258727094e-06,
+ "loss": 0.5549,
+ "step": 127
+ },
+ {
+ "epoch": 0.6018808777429467,
+ "grad_norm": 0.5579081773757935,
+ "learning_rate": 4.992961717193773e-06,
+ "loss": 0.5554,
+ "step": 128
+ },
+ {
+ "epoch": 0.6065830721003135,
+ "grad_norm": 0.6375890374183655,
+ "learning_rate": 4.9924502630091655e-06,
+ "loss": 0.5626,
+ "step": 129
+ },
+ {
+ "epoch": 0.6112852664576802,
+ "grad_norm": 0.5129190683364868,
+ "learning_rate": 4.99192089984822e-06,
+ "loss": 0.5493,
+ "step": 130
+ },
+ {
+ "epoch": 0.6159874608150471,
+ "grad_norm": 0.5293419361114502,
+ "learning_rate": 4.9913736315145614e-06,
+ "loss": 0.5565,
+ "step": 131
+ },
+ {
+ "epoch": 0.6206896551724138,
+ "grad_norm": 0.6502572298049927,
+ "learning_rate": 4.990808461940474e-06,
+ "loss": 0.5358,
+ "step": 132
+ },
+ {
+ "epoch": 0.6253918495297806,
+ "grad_norm": 0.5450296998023987,
+ "learning_rate": 4.990225395186862e-06,
+ "loss": 0.5421,
+ "step": 133
+ },
+ {
+ "epoch": 0.6300940438871473,
+ "grad_norm": 0.45506399869918823,
+ "learning_rate": 4.9896244354432314e-06,
+ "loss": 0.5396,
+ "step": 134
+ },
+ {
+ "epoch": 0.6347962382445141,
+ "grad_norm": 0.5095545649528503,
+ "learning_rate": 4.98900558702765e-06,
+ "loss": 0.5486,
+ "step": 135
+ },
+ {
+ "epoch": 0.6394984326018809,
+ "grad_norm": 0.4836446940898895,
+ "learning_rate": 4.9883688543867225e-06,
+ "loss": 0.5596,
+ "step": 136
+ },
+ {
+ "epoch": 0.6442006269592476,
+ "grad_norm": 0.5253512859344482,
+ "learning_rate": 4.987714242095558e-06,
+ "loss": 0.5308,
+ "step": 137
+ },
+ {
+ "epoch": 0.6489028213166145,
+ "grad_norm": 0.8280164003372192,
+ "learning_rate": 4.9870417548577355e-06,
+ "loss": 0.5349,
+ "step": 138
+ },
+ {
+ "epoch": 0.6536050156739812,
+ "grad_norm": 0.4729730188846588,
+ "learning_rate": 4.9863513975052696e-06,
+ "loss": 0.5416,
+ "step": 139
+ },
+ {
+ "epoch": 0.658307210031348,
+ "grad_norm": 0.5932718515396118,
+ "learning_rate": 4.985643174998578e-06,
+ "loss": 0.5638,
+ "step": 140
+ },
+ {
+ "epoch": 0.6630094043887147,
+ "grad_norm": 0.5187026262283325,
+ "learning_rate": 4.984917092426445e-06,
+ "loss": 0.5507,
+ "step": 141
+ },
+ {
+ "epoch": 0.6677115987460815,
+ "grad_norm": 0.5024245977401733,
+ "learning_rate": 4.984173155005982e-06,
+ "loss": 0.5406,
+ "step": 142
+ },
+ {
+ "epoch": 0.6724137931034483,
+ "grad_norm": 0.4735509157180786,
+ "learning_rate": 4.983411368082597e-06,
+ "loss": 0.5431,
+ "step": 143
+ },
+ {
+ "epoch": 0.677115987460815,
+ "grad_norm": 0.5040024518966675,
+ "learning_rate": 4.982631737129948e-06,
+ "loss": 0.5291,
+ "step": 144
+ },
+ {
+ "epoch": 0.6818181818181818,
+ "grad_norm": 0.47764894366264343,
+ "learning_rate": 4.98183426774991e-06,
+ "loss": 0.5677,
+ "step": 145
+ },
+ {
+ "epoch": 0.6865203761755486,
+ "grad_norm": 0.5211489796638489,
+ "learning_rate": 4.981018965672529e-06,
+ "loss": 0.566,
+ "step": 146
+ },
+ {
+ "epoch": 0.6912225705329154,
+ "grad_norm": 1.022007942199707,
+ "learning_rate": 4.98018583675599e-06,
+ "loss": 0.5476,
+ "step": 147
+ },
+ {
+ "epoch": 0.6959247648902821,
+ "grad_norm": 0.5263912677764893,
+ "learning_rate": 4.979334886986562e-06,
+ "loss": 0.5473,
+ "step": 148
+ },
+ {
+ "epoch": 0.700626959247649,
+ "grad_norm": 0.5014091730117798,
+ "learning_rate": 4.978466122478567e-06,
+ "loss": 0.5642,
+ "step": 149
+ },
+ {
+ "epoch": 0.7053291536050157,
+ "grad_norm": 0.5003350973129272,
+ "learning_rate": 4.97757954947433e-06,
+ "loss": 0.5311,
+ "step": 150
+ },
+ {
+ "epoch": 0.7100313479623824,
+ "grad_norm": 0.5010690093040466,
+ "learning_rate": 4.976675174344132e-06,
+ "loss": 0.5469,
+ "step": 151
+ },
+ {
+ "epoch": 0.7147335423197492,
+ "grad_norm": 0.45779237151145935,
+ "learning_rate": 4.975753003586172e-06,
+ "loss": 0.5273,
+ "step": 152
+ },
+ {
+ "epoch": 0.719435736677116,
+ "grad_norm": 0.6231539845466614,
+ "learning_rate": 4.974813043826513e-06,
+ "loss": 0.5182,
+ "step": 153
+ },
+ {
+ "epoch": 0.7241379310344828,
+ "grad_norm": 0.5361394286155701,
+ "learning_rate": 4.973855301819039e-06,
+ "loss": 0.5372,
+ "step": 154
+ },
+ {
+ "epoch": 0.7288401253918495,
+ "grad_norm": 0.5193538665771484,
+ "learning_rate": 4.972879784445402e-06,
+ "loss": 0.5201,
+ "step": 155
+ },
+ {
+ "epoch": 0.7335423197492164,
+ "grad_norm": 0.47956809401512146,
+ "learning_rate": 4.971886498714978e-06,
+ "loss": 0.5402,
+ "step": 156
+ },
+ {
+ "epoch": 0.7382445141065831,
+ "grad_norm": 0.5303016901016235,
+ "learning_rate": 4.97087545176481e-06,
+ "loss": 0.5174,
+ "step": 157
+ },
+ {
+ "epoch": 0.7429467084639498,
+ "grad_norm": 0.5002286434173584,
+ "learning_rate": 4.9698466508595655e-06,
+ "loss": 0.5453,
+ "step": 158
+ },
+ {
+ "epoch": 0.7476489028213166,
+ "grad_norm": 0.6070297360420227,
+ "learning_rate": 4.9688001033914756e-06,
+ "loss": 0.5327,
+ "step": 159
+ },
+ {
+ "epoch": 0.7523510971786834,
+ "grad_norm": 0.5436793565750122,
+ "learning_rate": 4.967735816880286e-06,
+ "loss": 0.544,
+ "step": 160
+ },
+ {
+ "epoch": 0.7570532915360502,
+ "grad_norm": 0.538012683391571,
+ "learning_rate": 4.966653798973205e-06,
+ "loss": 0.5233,
+ "step": 161
+ },
+ {
+ "epoch": 0.7617554858934169,
+ "grad_norm": 0.4916169345378876,
+ "learning_rate": 4.965554057444842e-06,
+ "loss": 0.5168,
+ "step": 162
+ },
+ {
+ "epoch": 0.7664576802507836,
+ "grad_norm": 0.48281437158584595,
+ "learning_rate": 4.964436600197161e-06,
+ "loss": 0.5393,
+ "step": 163
+ },
+ {
+ "epoch": 0.7711598746081505,
+ "grad_norm": 0.5184990167617798,
+ "learning_rate": 4.963301435259413e-06,
+ "loss": 0.5085,
+ "step": 164
+ },
+ {
+ "epoch": 0.7758620689655172,
+ "grad_norm": 0.4706438183784485,
+ "learning_rate": 4.962148570788088e-06,
+ "loss": 0.5299,
+ "step": 165
+ },
+ {
+ "epoch": 0.780564263322884,
+ "grad_norm": 0.6550764441490173,
+ "learning_rate": 4.96097801506685e-06,
+ "loss": 0.5192,
+ "step": 166
+ },
+ {
+ "epoch": 0.7852664576802508,
+ "grad_norm": 0.5386581420898438,
+ "learning_rate": 4.959789776506482e-06,
+ "loss": 0.5258,
+ "step": 167
+ },
+ {
+ "epoch": 0.7899686520376176,
+ "grad_norm": 0.5060779452323914,
+ "learning_rate": 4.958583863644821e-06,
+ "loss": 0.5512,
+ "step": 168
+ },
+ {
+ "epoch": 0.7946708463949843,
+ "grad_norm": 0.47050032019615173,
+ "learning_rate": 4.9573602851466985e-06,
+ "loss": 0.5176,
+ "step": 169
+ },
+ {
+ "epoch": 0.799373040752351,
+ "grad_norm": 7.3139567375183105,
+ "learning_rate": 4.9561190498038815e-06,
+ "loss": 0.5381,
+ "step": 170
+ },
+ {
+ "epoch": 0.8040752351097179,
+ "grad_norm": 0.620528519153595,
+ "learning_rate": 4.954860166535005e-06,
+ "loss": 0.5299,
+ "step": 171
+ },
+ {
+ "epoch": 0.8087774294670846,
+ "grad_norm": 0.45067766308784485,
+ "learning_rate": 4.95358364438551e-06,
+ "loss": 0.5328,
+ "step": 172
+ },
+ {
+ "epoch": 0.8134796238244514,
+ "grad_norm": 0.6771508455276489,
+ "learning_rate": 4.952289492527576e-06,
+ "loss": 0.5601,
+ "step": 173
+ },
+ {
+ "epoch": 0.8181818181818182,
+ "grad_norm": 0.518925130367279,
+ "learning_rate": 4.9509777202600605e-06,
+ "loss": 0.494,
+ "step": 174
+ },
+ {
+ "epoch": 0.822884012539185,
+ "grad_norm": 0.5191988945007324,
+ "learning_rate": 4.949648337008425e-06,
+ "loss": 0.5425,
+ "step": 175
+ },
+ {
+ "epoch": 0.8275862068965517,
+ "grad_norm": 0.8600963354110718,
+ "learning_rate": 4.948301352324674e-06,
+ "loss": 0.5332,
+ "step": 176
+ },
+ {
+ "epoch": 0.8322884012539185,
+ "grad_norm": 0.5405915379524231,
+ "learning_rate": 4.946936775887281e-06,
+ "loss": 0.5276,
+ "step": 177
+ },
+ {
+ "epoch": 0.8369905956112853,
+ "grad_norm": 0.48730772733688354,
+ "learning_rate": 4.945554617501124e-06,
+ "loss": 0.5217,
+ "step": 178
+ },
+ {
+ "epoch": 0.841692789968652,
+ "grad_norm": 0.5092865824699402,
+ "learning_rate": 4.944154887097411e-06,
+ "loss": 0.5534,
+ "step": 179
+ },
+ {
+ "epoch": 0.8463949843260188,
+ "grad_norm": 0.4994933605194092,
+ "learning_rate": 4.942737594733608e-06,
+ "loss": 0.5242,
+ "step": 180
+ },
+ {
+ "epoch": 0.8510971786833855,
+ "grad_norm": 0.4554043412208557,
+ "learning_rate": 4.941302750593373e-06,
+ "loss": 0.5424,
+ "step": 181
+ },
+ {
+ "epoch": 0.8557993730407524,
+ "grad_norm": 0.4865265488624573,
+ "learning_rate": 4.939850364986475e-06,
+ "loss": 0.482,
+ "step": 182
+ },
+ {
+ "epoch": 0.8605015673981191,
+ "grad_norm": 0.5013875365257263,
+ "learning_rate": 4.938380448348725e-06,
+ "loss": 0.4908,
+ "step": 183
+ },
+ {
+ "epoch": 0.8652037617554859,
+ "grad_norm": 0.4997917115688324,
+ "learning_rate": 4.9368930112419e-06,
+ "loss": 0.5336,
+ "step": 184
+ },
+ {
+ "epoch": 0.8699059561128527,
+ "grad_norm": 0.4783482551574707,
+ "learning_rate": 4.935388064353665e-06,
+ "loss": 0.5338,
+ "step": 185
+ },
+ {
+ "epoch": 0.8746081504702194,
+ "grad_norm": 0.7221089005470276,
+ "learning_rate": 4.9338656184975e-06,
+ "loss": 0.5327,
+ "step": 186
+ },
+ {
+ "epoch": 0.8793103448275862,
+ "grad_norm": 0.48115843534469604,
+ "learning_rate": 4.932325684612618e-06,
+ "loss": 0.5408,
+ "step": 187
+ },
+ {
+ "epoch": 0.8840125391849529,
+ "grad_norm": 0.4940219223499298,
+ "learning_rate": 4.93076827376389e-06,
+ "loss": 0.5455,
+ "step": 188
+ },
+ {
+ "epoch": 0.8887147335423198,
+ "grad_norm": 0.4754747450351715,
+ "learning_rate": 4.9291933971417635e-06,
+ "loss": 0.542,
+ "step": 189
+ },
+ {
+ "epoch": 0.8934169278996865,
+ "grad_norm": 0.548713207244873,
+ "learning_rate": 4.9276010660621835e-06,
+ "loss": 0.5292,
+ "step": 190
+ },
+ {
+ "epoch": 0.8981191222570533,
+ "grad_norm": 0.7292612195014954,
+ "learning_rate": 4.925991291966508e-06,
+ "loss": 0.5073,
+ "step": 191
+ },
+ {
+ "epoch": 0.9028213166144201,
+ "grad_norm": 0.5254770517349243,
+ "learning_rate": 4.92436408642143e-06,
+ "loss": 0.5451,
+ "step": 192
+ },
+ {
+ "epoch": 0.9075235109717869,
+ "grad_norm": 0.47938767075538635,
+ "learning_rate": 4.9227194611188934e-06,
+ "loss": 0.5204,
+ "step": 193
+ },
+ {
+ "epoch": 0.9122257053291536,
+ "grad_norm": 0.6740232706069946,
+ "learning_rate": 4.921057427876007e-06,
+ "loss": 0.4928,
+ "step": 194
+ },
+ {
+ "epoch": 0.9169278996865203,
+ "grad_norm": 0.5455343723297119,
+ "learning_rate": 4.919377998634959e-06,
+ "loss": 0.5468,
+ "step": 195
+ },
+ {
+ "epoch": 0.9216300940438872,
+ "grad_norm": 0.5001958012580872,
+ "learning_rate": 4.917681185462934e-06,
+ "loss": 0.5339,
+ "step": 196
+ },
+ {
+ "epoch": 0.9263322884012539,
+ "grad_norm": 0.5084257125854492,
+ "learning_rate": 4.915967000552028e-06,
+ "loss": 0.5259,
+ "step": 197
+ },
+ {
+ "epoch": 0.9310344827586207,
+ "grad_norm": 0.4807164967060089,
+ "learning_rate": 4.914235456219154e-06,
+ "loss": 0.5204,
+ "step": 198
+ },
+ {
+ "epoch": 0.9357366771159875,
+ "grad_norm": 0.6099370718002319,
+ "learning_rate": 4.912486564905959e-06,
+ "loss": 0.544,
+ "step": 199
+ },
+ {
+ "epoch": 0.9404388714733543,
+ "grad_norm": 0.47461947798728943,
+ "learning_rate": 4.910720339178735e-06,
+ "loss": 0.5295,
+ "step": 200
+ },
+ {
+ "epoch": 0.945141065830721,
+ "grad_norm": 0.500136137008667,
+ "learning_rate": 4.908936791728323e-06,
+ "loss": 0.5321,
+ "step": 201
+ },
+ {
+ "epoch": 0.9498432601880877,
+ "grad_norm": 0.5235631465911865,
+ "learning_rate": 4.907135935370027e-06,
+ "loss": 0.5338,
+ "step": 202
+ },
+ {
+ "epoch": 0.9545454545454546,
+ "grad_norm": 0.9285804629325867,
+ "learning_rate": 4.905317783043523e-06,
+ "loss": 0.5393,
+ "step": 203
+ },
+ {
+ "epoch": 0.9592476489028213,
+ "grad_norm": 0.4834178388118744,
+ "learning_rate": 4.9034823478127605e-06,
+ "loss": 0.5211,
+ "step": 204
+ },
+ {
+ "epoch": 0.9639498432601881,
+ "grad_norm": 0.4830580949783325,
+ "learning_rate": 4.901629642865872e-06,
+ "loss": 0.4986,
+ "step": 205
+ },
+ {
+ "epoch": 0.9686520376175548,
+ "grad_norm": 0.49718615412712097,
+ "learning_rate": 4.89975968151508e-06,
+ "loss": 0.5204,
+ "step": 206
+ },
+ {
+ "epoch": 0.9733542319749217,
+ "grad_norm": 0.5056726336479187,
+ "learning_rate": 4.8978724771965965e-06,
+ "loss": 0.5133,
+ "step": 207
+ },
+ {
+ "epoch": 0.9780564263322884,
+ "grad_norm": 0.7357563376426697,
+ "learning_rate": 4.895968043470532e-06,
+ "loss": 0.5307,
+ "step": 208
+ },
+ {
+ "epoch": 0.9827586206896551,
+ "grad_norm": 0.515610933303833,
+ "learning_rate": 4.894046394020794e-06,
+ "loss": 0.4955,
+ "step": 209
+ },
+ {
+ "epoch": 0.987460815047022,
+ "grad_norm": 0.5124618411064148,
+ "learning_rate": 4.892107542654988e-06,
+ "loss": 0.526,
+ "step": 210
+ },
+ {
+ "epoch": 0.9921630094043887,
+ "grad_norm": 0.5059565901756287,
+ "learning_rate": 4.890151503304325e-06,
+ "loss": 0.5473,
+ "step": 211
+ },
+ {
+ "epoch": 0.9968652037617555,
+ "grad_norm": 0.4806717336177826,
+ "learning_rate": 4.88817829002351e-06,
+ "loss": 0.5212,
+ "step": 212
+ },
+ {
+ "epoch": 1.0047021943573669,
+ "grad_norm": 0.9454345703125,
+ "learning_rate": 4.886187916990653e-06,
+ "loss": 1.0566,
+ "step": 213
+ },
+ {
+ "epoch": 1.0094043887147335,
+ "grad_norm": 0.4871070086956024,
+ "learning_rate": 4.884180398507163e-06,
+ "loss": 0.503,
+ "step": 214
+ },
+ {
+ "epoch": 1.0141065830721003,
+ "grad_norm": 0.45102012157440186,
+ "learning_rate": 4.882155748997636e-06,
+ "loss": 0.4954,
+ "step": 215
+ },
+ {
+ "epoch": 1.0188087774294672,
+ "grad_norm": 0.49910685420036316,
+ "learning_rate": 4.8801139830097685e-06,
+ "loss": 0.5019,
+ "step": 216
+ },
+ {
+ "epoch": 1.0235109717868338,
+ "grad_norm": 0.5155763030052185,
+ "learning_rate": 4.878055115214238e-06,
+ "loss": 0.5102,
+ "step": 217
+ },
+ {
+ "epoch": 1.0282131661442007,
+ "grad_norm": 0.4567059874534607,
+ "learning_rate": 4.875979160404607e-06,
+ "loss": 0.5069,
+ "step": 218
+ },
+ {
+ "epoch": 1.0329153605015673,
+ "grad_norm": 0.4782896935939789,
+ "learning_rate": 4.873886133497209e-06,
+ "loss": 0.5182,
+ "step": 219
+ },
+ {
+ "epoch": 1.0376175548589341,
+ "grad_norm": 0.44995731115341187,
+ "learning_rate": 4.87177604953105e-06,
+ "loss": 0.513,
+ "step": 220
+ },
+ {
+ "epoch": 1.042319749216301,
+ "grad_norm": 0.470059871673584,
+ "learning_rate": 4.869648923667694e-06,
+ "loss": 0.468,
+ "step": 221
+ },
+ {
+ "epoch": 1.0470219435736676,
+ "grad_norm": 0.5356128215789795,
+ "learning_rate": 4.867504771191154e-06,
+ "loss": 0.4942,
+ "step": 222
+ },
+ {
+ "epoch": 1.0517241379310345,
+ "grad_norm": 0.5137870907783508,
+ "learning_rate": 4.865343607507788e-06,
+ "loss": 0.5022,
+ "step": 223
+ },
+ {
+ "epoch": 1.0564263322884013,
+ "grad_norm": 0.47419992089271545,
+ "learning_rate": 4.86316544814618e-06,
+ "loss": 0.5158,
+ "step": 224
+ },
+ {
+ "epoch": 1.061128526645768,
+ "grad_norm": 0.49087393283843994,
+ "learning_rate": 4.860970308757038e-06,
+ "loss": 0.4605,
+ "step": 225
+ },
+ {
+ "epoch": 1.0658307210031348,
+ "grad_norm": 0.4988348186016083,
+ "learning_rate": 4.858758205113072e-06,
+ "loss": 0.4912,
+ "step": 226
+ },
+ {
+ "epoch": 1.0705329153605017,
+ "grad_norm": 0.44543248414993286,
+ "learning_rate": 4.856529153108888e-06,
+ "loss": 0.524,
+ "step": 227
+ },
+ {
+ "epoch": 1.0752351097178683,
+ "grad_norm": 0.5953351259231567,
+ "learning_rate": 4.854283168760868e-06,
+ "loss": 0.5001,
+ "step": 228
+ },
+ {
+ "epoch": 1.0799373040752351,
+ "grad_norm": 0.5012004375457764,
+ "learning_rate": 4.85202026820706e-06,
+ "loss": 0.4968,
+ "step": 229
+ },
+ {
+ "epoch": 1.084639498432602,
+ "grad_norm": 0.5023937821388245,
+ "learning_rate": 4.84974046770706e-06,
+ "loss": 0.5345,
+ "step": 230
+ },
+ {
+ "epoch": 1.0893416927899686,
+ "grad_norm": 0.4705684185028076,
+ "learning_rate": 4.847443783641893e-06,
+ "loss": 0.4459,
+ "step": 231
+ },
+ {
+ "epoch": 1.0940438871473355,
+ "grad_norm": 0.5082476735115051,
+ "learning_rate": 4.845130232513901e-06,
+ "loss": 0.4905,
+ "step": 232
+ },
+ {
+ "epoch": 1.098746081504702,
+ "grad_norm": 0.5283995866775513,
+ "learning_rate": 4.842799830946615e-06,
+ "loss": 0.4878,
+ "step": 233
+ },
+ {
+ "epoch": 1.103448275862069,
+ "grad_norm": 0.6373623013496399,
+ "learning_rate": 4.840452595684646e-06,
+ "loss": 0.4867,
+ "step": 234
+ },
+ {
+ "epoch": 1.1081504702194358,
+ "grad_norm": 0.4624481201171875,
+ "learning_rate": 4.83808854359356e-06,
+ "loss": 0.4793,
+ "step": 235
+ },
+ {
+ "epoch": 1.1128526645768024,
+ "grad_norm": 0.4659098982810974,
+ "learning_rate": 4.835707691659753e-06,
+ "loss": 0.4827,
+ "step": 236
+ },
+ {
+ "epoch": 1.1175548589341693,
+ "grad_norm": 0.4920850396156311,
+ "learning_rate": 4.8333100569903365e-06,
+ "loss": 0.4932,
+ "step": 237
+ },
+ {
+ "epoch": 1.1222570532915361,
+ "grad_norm": 0.492286741733551,
+ "learning_rate": 4.8308956568130094e-06,
+ "loss": 0.5144,
+ "step": 238
+ },
+ {
+ "epoch": 1.1269592476489028,
+ "grad_norm": 0.5429807901382446,
+ "learning_rate": 4.828464508475934e-06,
+ "loss": 0.5054,
+ "step": 239
+ },
+ {
+ "epoch": 1.1316614420062696,
+ "grad_norm": 2.4671998023986816,
+ "learning_rate": 4.826016629447616e-06,
+ "loss": 0.5073,
+ "step": 240
+ },
+ {
+ "epoch": 1.1363636363636362,
+ "grad_norm": 0.4593118131160736,
+ "learning_rate": 4.823552037316775e-06,
+ "loss": 0.4856,
+ "step": 241
+ },
+ {
+ "epoch": 1.141065830721003,
+ "grad_norm": 0.6855646371841431,
+ "learning_rate": 4.821070749792218e-06,
+ "loss": 0.5388,
+ "step": 242
+ },
+ {
+ "epoch": 1.14576802507837,
+ "grad_norm": 0.5722374320030212,
+ "learning_rate": 4.818572784702713e-06,
+ "loss": 0.51,
+ "step": 243
+ },
+ {
+ "epoch": 1.1504702194357366,
+ "grad_norm": 0.4901357591152191,
+ "learning_rate": 4.816058159996863e-06,
+ "loss": 0.5201,
+ "step": 244
+ },
+ {
+ "epoch": 1.1551724137931034,
+ "grad_norm": 0.4655209481716156,
+ "learning_rate": 4.813526893742972e-06,
+ "loss": 0.501,
+ "step": 245
+ },
+ {
+ "epoch": 1.1598746081504703,
+ "grad_norm": 0.7608394622802734,
+ "learning_rate": 4.810979004128924e-06,
+ "loss": 0.4961,
+ "step": 246
+ },
+ {
+ "epoch": 1.164576802507837,
+ "grad_norm": 0.4857081472873688,
+ "learning_rate": 4.808414509462042e-06,
+ "loss": 0.5174,
+ "step": 247
+ },
+ {
+ "epoch": 1.1692789968652038,
+ "grad_norm": 0.46672946214675903,
+ "learning_rate": 4.80583342816896e-06,
+ "loss": 0.484,
+ "step": 248
+ },
+ {
+ "epoch": 1.1739811912225706,
+ "grad_norm": 0.46982088685035706,
+ "learning_rate": 4.803235778795496e-06,
+ "loss": 0.5236,
+ "step": 249
+ },
+ {
+ "epoch": 1.1786833855799372,
+ "grad_norm": 0.5086098909378052,
+ "learning_rate": 4.800621580006511e-06,
+ "loss": 0.4673,
+ "step": 250
+ },
+ {
+ "epoch": 1.183385579937304,
+ "grad_norm": 0.45968860387802124,
+ "learning_rate": 4.797990850585782e-06,
+ "loss": 0.5151,
+ "step": 251
+ },
+ {
+ "epoch": 1.188087774294671,
+ "grad_norm": 0.49544984102249146,
+ "learning_rate": 4.79534360943586e-06,
+ "loss": 0.494,
+ "step": 252
+ },
+ {
+ "epoch": 1.1927899686520376,
+ "grad_norm": 0.531892716884613,
+ "learning_rate": 4.792679875577937e-06,
+ "loss": 0.4778,
+ "step": 253
+ },
+ {
+ "epoch": 1.1974921630094044,
+ "grad_norm": 0.5013542175292969,
+ "learning_rate": 4.789999668151714e-06,
+ "loss": 0.5132,
+ "step": 254
+ },
+ {
+ "epoch": 1.2021943573667713,
+ "grad_norm": 0.46963250637054443,
+ "learning_rate": 4.7873030064152545e-06,
+ "loss": 0.4938,
+ "step": 255
+ },
+ {
+ "epoch": 1.206896551724138,
+ "grad_norm": 0.465285986661911,
+ "learning_rate": 4.784589909744856e-06,
+ "loss": 0.4898,
+ "step": 256
+ },
+ {
+ "epoch": 1.2115987460815048,
+ "grad_norm": 0.5183936357498169,
+ "learning_rate": 4.7818603976349005e-06,
+ "loss": 0.5004,
+ "step": 257
+ },
+ {
+ "epoch": 1.2163009404388714,
+ "grad_norm": 0.47324836254119873,
+ "learning_rate": 4.779114489697724e-06,
+ "loss": 0.4972,
+ "step": 258
+ },
+ {
+ "epoch": 1.2210031347962382,
+ "grad_norm": 0.5208264589309692,
+ "learning_rate": 4.776352205663469e-06,
+ "loss": 0.5023,
+ "step": 259
+ },
+ {
+ "epoch": 1.225705329153605,
+ "grad_norm": 0.5583804845809937,
+ "learning_rate": 4.773573565379947e-06,
+ "loss": 0.5099,
+ "step": 260
+ },
+ {
+ "epoch": 1.2304075235109717,
+ "grad_norm": 0.5016160011291504,
+ "learning_rate": 4.770778588812489e-06,
+ "loss": 0.4765,
+ "step": 261
+ },
+ {
+ "epoch": 1.2351097178683386,
+ "grad_norm": 0.50210040807724,
+ "learning_rate": 4.7679672960438135e-06,
+ "loss": 0.5029,
+ "step": 262
+ },
+ {
+ "epoch": 1.2398119122257054,
+ "grad_norm": 0.6636150479316711,
+ "learning_rate": 4.765139707273872e-06,
+ "loss": 0.4909,
+ "step": 263
+ },
+ {
+ "epoch": 1.244514106583072,
+ "grad_norm": 0.4798625111579895,
+ "learning_rate": 4.762295842819707e-06,
+ "loss": 0.5012,
+ "step": 264
+ },
+ {
+ "epoch": 1.249216300940439,
+ "grad_norm": 0.5282374024391174,
+ "learning_rate": 4.759435723115308e-06,
+ "loss": 0.4681,
+ "step": 265
+ },
+ {
+ "epoch": 1.2539184952978055,
+ "grad_norm": 0.5356930494308472,
+ "learning_rate": 4.756559368711463e-06,
+ "loss": 0.506,
+ "step": 266
+ },
+ {
+ "epoch": 1.2586206896551724,
+ "grad_norm": 0.4857093095779419,
+ "learning_rate": 4.75366680027561e-06,
+ "loss": 0.4889,
+ "step": 267
+ },
+ {
+ "epoch": 1.2633228840125392,
+ "grad_norm": 0.484018474817276,
+ "learning_rate": 4.7507580385916906e-06,
+ "loss": 0.4899,
+ "step": 268
+ },
+ {
+ "epoch": 1.2680250783699059,
+ "grad_norm": 0.49720871448516846,
+ "learning_rate": 4.747833104559999e-06,
+ "loss": 0.4654,
+ "step": 269
+ },
+ {
+ "epoch": 1.2727272727272727,
+ "grad_norm": 0.4631911516189575,
+ "learning_rate": 4.744892019197033e-06,
+ "loss": 0.4796,
+ "step": 270
+ },
+ {
+ "epoch": 1.2774294670846396,
+ "grad_norm": 0.5116872787475586,
+ "learning_rate": 4.74193480363534e-06,
+ "loss": 0.4883,
+ "step": 271
+ },
+ {
+ "epoch": 1.2821316614420062,
+ "grad_norm": 0.5275093913078308,
+ "learning_rate": 4.738961479123373e-06,
+ "loss": 0.496,
+ "step": 272
+ },
+ {
+ "epoch": 1.286833855799373,
+ "grad_norm": 0.5001885890960693,
+ "learning_rate": 4.735972067025326e-06,
+ "loss": 0.5012,
+ "step": 273
+ },
+ {
+ "epoch": 1.29153605015674,
+ "grad_norm": 0.5875861048698425,
+ "learning_rate": 4.732966588820991e-06,
+ "loss": 0.4951,
+ "step": 274
+ },
+ {
+ "epoch": 1.2962382445141065,
+ "grad_norm": 0.4893011748790741,
+ "learning_rate": 4.729945066105599e-06,
+ "loss": 0.4742,
+ "step": 275
+ },
+ {
+ "epoch": 1.3009404388714734,
+ "grad_norm": 0.4648543894290924,
+ "learning_rate": 4.726907520589664e-06,
+ "loss": 0.466,
+ "step": 276
+ },
+ {
+ "epoch": 1.3056426332288402,
+ "grad_norm": 0.5300162434577942,
+ "learning_rate": 4.72385397409883e-06,
+ "loss": 0.5072,
+ "step": 277
+ },
+ {
+ "epoch": 1.3103448275862069,
+ "grad_norm": 0.4667080044746399,
+ "learning_rate": 4.720784448573712e-06,
+ "loss": 0.4986,
+ "step": 278
+ },
+ {
+ "epoch": 1.3150470219435737,
+ "grad_norm": 0.5278895497322083,
+ "learning_rate": 4.717698966069739e-06,
+ "loss": 0.5269,
+ "step": 279
+ },
+ {
+ "epoch": 1.3197492163009406,
+ "grad_norm": 0.5325866937637329,
+ "learning_rate": 4.7145975487569965e-06,
+ "loss": 0.5074,
+ "step": 280
+ },
+ {
+ "epoch": 1.3244514106583072,
+ "grad_norm": 0.500861644744873,
+ "learning_rate": 4.711480218920064e-06,
+ "loss": 0.4695,
+ "step": 281
+ },
+ {
+ "epoch": 1.329153605015674,
+ "grad_norm": 0.5263222455978394,
+ "learning_rate": 4.708346998957859e-06,
+ "loss": 0.5173,
+ "step": 282
+ },
+ {
+ "epoch": 1.3338557993730409,
+ "grad_norm": 0.622900128364563,
+ "learning_rate": 4.705197911383473e-06,
+ "loss": 0.4905,
+ "step": 283
+ },
+ {
+ "epoch": 1.3385579937304075,
+ "grad_norm": 0.49273768067359924,
+ "learning_rate": 4.7020329788240115e-06,
+ "loss": 0.4743,
+ "step": 284
+ },
+ {
+ "epoch": 1.3432601880877744,
+ "grad_norm": 0.49558964371681213,
+ "learning_rate": 4.6988522240204325e-06,
+ "loss": 0.4824,
+ "step": 285
+ },
+ {
+ "epoch": 1.347962382445141,
+ "grad_norm": 0.4743976891040802,
+ "learning_rate": 4.695655669827377e-06,
+ "loss": 0.4977,
+ "step": 286
+ },
+ {
+ "epoch": 1.3526645768025078,
+ "grad_norm": 0.49542659521102905,
+ "learning_rate": 4.6924433392130135e-06,
+ "loss": 0.4924,
+ "step": 287
+ },
+ {
+ "epoch": 1.3573667711598745,
+ "grad_norm": 0.7385990619659424,
+ "learning_rate": 4.689215255258866e-06,
+ "loss": 0.5091,
+ "step": 288
+ },
+ {
+ "epoch": 1.3620689655172413,
+ "grad_norm": 0.4826123118400574,
+ "learning_rate": 4.685971441159653e-06,
+ "loss": 0.4791,
+ "step": 289
+ },
+ {
+ "epoch": 1.3667711598746082,
+ "grad_norm": 0.5389033555984497,
+ "learning_rate": 4.682711920223115e-06,
+ "loss": 0.4751,
+ "step": 290
+ },
+ {
+ "epoch": 1.3714733542319748,
+ "grad_norm": 0.5059546232223511,
+ "learning_rate": 4.679436715869856e-06,
+ "loss": 0.499,
+ "step": 291
+ },
+ {
+ "epoch": 1.3761755485893417,
+ "grad_norm": 0.5682849884033203,
+ "learning_rate": 4.676145851633166e-06,
+ "loss": 0.5143,
+ "step": 292
+ },
+ {
+ "epoch": 1.3808777429467085,
+ "grad_norm": 0.4754337668418884,
+ "learning_rate": 4.672839351158856e-06,
+ "loss": 0.4997,
+ "step": 293
+ },
+ {
+ "epoch": 1.3855799373040751,
+ "grad_norm": 0.5227643847465515,
+ "learning_rate": 4.669517238205089e-06,
+ "loss": 0.4834,
+ "step": 294
+ },
+ {
+ "epoch": 1.390282131661442,
+ "grad_norm": 0.4954044222831726,
+ "learning_rate": 4.666179536642208e-06,
+ "loss": 0.483,
+ "step": 295
+ },
+ {
+ "epoch": 1.3949843260188088,
+ "grad_norm": 0.4909021556377411,
+ "learning_rate": 4.662826270452565e-06,
+ "loss": 0.4808,
+ "step": 296
+ },
+ {
+ "epoch": 1.3996865203761755,
+ "grad_norm": 0.4666971266269684,
+ "learning_rate": 4.659457463730347e-06,
+ "loss": 0.488,
+ "step": 297
+ },
+ {
+ "epoch": 1.4043887147335423,
+ "grad_norm": 0.5064187049865723,
+ "learning_rate": 4.6560731406814056e-06,
+ "loss": 0.5046,
+ "step": 298
+ },
+ {
+ "epoch": 1.4090909090909092,
+ "grad_norm": 0.4958318769931793,
+ "learning_rate": 4.65267332562308e-06,
+ "loss": 0.5102,
+ "step": 299
+ },
+ {
+ "epoch": 1.4137931034482758,
+ "grad_norm": 0.5080632567405701,
+ "learning_rate": 4.649258042984026e-06,
+ "loss": 0.5055,
+ "step": 300
+ },
+ {
+ "epoch": 1.4184952978056427,
+ "grad_norm": 0.46236541867256165,
+ "learning_rate": 4.6458273173040395e-06,
+ "loss": 0.4606,
+ "step": 301
+ },
+ {
+ "epoch": 1.4231974921630095,
+ "grad_norm": 1.8524898290634155,
+ "learning_rate": 4.642381173233874e-06,
+ "loss": 0.5002,
+ "step": 302
+ },
+ {
+ "epoch": 1.4278996865203761,
+ "grad_norm": 0.5202615261077881,
+ "learning_rate": 4.638919635535073e-06,
+ "loss": 0.4562,
+ "step": 303
+ },
+ {
+ "epoch": 1.432601880877743,
+ "grad_norm": 0.5293647050857544,
+ "learning_rate": 4.635442729079788e-06,
+ "loss": 0.4806,
+ "step": 304
+ },
+ {
+ "epoch": 1.4373040752351098,
+ "grad_norm": 0.5165356993675232,
+ "learning_rate": 4.6319504788505956e-06,
+ "loss": 0.4775,
+ "step": 305
+ },
+ {
+ "epoch": 1.4420062695924765,
+ "grad_norm": 0.5092841386795044,
+ "learning_rate": 4.628442909940325e-06,
+ "loss": 0.4892,
+ "step": 306
+ },
+ {
+ "epoch": 1.4467084639498433,
+ "grad_norm": 0.511424720287323,
+ "learning_rate": 4.624920047551874e-06,
+ "loss": 0.506,
+ "step": 307
+ },
+ {
+ "epoch": 1.4514106583072102,
+ "grad_norm": 0.5631566643714905,
+ "learning_rate": 4.621381916998029e-06,
+ "loss": 0.4741,
+ "step": 308
+ },
+ {
+ "epoch": 1.4561128526645768,
+ "grad_norm": 0.4748315215110779,
+ "learning_rate": 4.6178285437012806e-06,
+ "loss": 0.5084,
+ "step": 309
+ },
+ {
+ "epoch": 1.4608150470219436,
+ "grad_norm": 0.47158119082450867,
+ "learning_rate": 4.6142599531936435e-06,
+ "loss": 0.4697,
+ "step": 310
+ },
+ {
+ "epoch": 1.4655172413793103,
+ "grad_norm": 0.5358107089996338,
+ "learning_rate": 4.610676171116475e-06,
+ "loss": 0.491,
+ "step": 311
+ },
+ {
+ "epoch": 1.4702194357366771,
+ "grad_norm": 0.47717440128326416,
+ "learning_rate": 4.607077223220286e-06,
+ "loss": 0.4948,
+ "step": 312
+ },
+ {
+ "epoch": 1.4749216300940438,
+ "grad_norm": 0.5041193962097168,
+ "learning_rate": 4.603463135364556e-06,
+ "loss": 0.4648,
+ "step": 313
+ },
+ {
+ "epoch": 1.4796238244514106,
+ "grad_norm": 0.9311274290084839,
+ "learning_rate": 4.5998339335175555e-06,
+ "loss": 0.4866,
+ "step": 314
+ },
+ {
+ "epoch": 1.4843260188087775,
+ "grad_norm": 0.47408604621887207,
+ "learning_rate": 4.596189643756147e-06,
+ "loss": 0.4634,
+ "step": 315
+ },
+ {
+ "epoch": 1.489028213166144,
+ "grad_norm": 0.5052632093429565,
+ "learning_rate": 4.592530292265609e-06,
+ "loss": 0.4843,
+ "step": 316
+ },
+ {
+ "epoch": 1.493730407523511,
+ "grad_norm": 0.5100846886634827,
+ "learning_rate": 4.58885590533944e-06,
+ "loss": 0.4942,
+ "step": 317
+ },
+ {
+ "epoch": 1.4984326018808778,
+ "grad_norm": 0.5132214426994324,
+ "learning_rate": 4.585166509379173e-06,
+ "loss": 0.5135,
+ "step": 318
+ },
+ {
+ "epoch": 1.5031347962382444,
+ "grad_norm": 11.112855911254883,
+ "learning_rate": 4.581462130894186e-06,
+ "loss": 0.4933,
+ "step": 319
+ },
+ {
+ "epoch": 1.5078369905956113,
+ "grad_norm": 0.4873805642127991,
+ "learning_rate": 4.57774279650151e-06,
+ "loss": 0.483,
+ "step": 320
+ },
+ {
+ "epoch": 1.5125391849529781,
+ "grad_norm": 0.5026459693908691,
+ "learning_rate": 4.574008532925638e-06,
+ "loss": 0.5075,
+ "step": 321
+ },
+ {
+ "epoch": 1.5172413793103448,
+ "grad_norm": 0.489947110414505,
+ "learning_rate": 4.570259366998336e-06,
+ "loss": 0.4954,
+ "step": 322
+ },
+ {
+ "epoch": 1.5219435736677116,
+ "grad_norm": 0.48120853304862976,
+ "learning_rate": 4.566495325658445e-06,
+ "loss": 0.5221,
+ "step": 323
+ },
+ {
+ "epoch": 1.5266457680250785,
+ "grad_norm": 0.4880066514015198,
+ "learning_rate": 4.5627164359516915e-06,
+ "loss": 0.5031,
+ "step": 324
+ },
+ {
+ "epoch": 1.531347962382445,
+ "grad_norm": 0.5048410892486572,
+ "learning_rate": 4.558922725030491e-06,
+ "loss": 0.4757,
+ "step": 325
+ },
+ {
+ "epoch": 1.536050156739812,
+ "grad_norm": 0.7033756375312805,
+ "learning_rate": 4.555114220153755e-06,
+ "loss": 0.4285,
+ "step": 326
+ },
+ {
+ "epoch": 1.5407523510971788,
+ "grad_norm": 0.4716516435146332,
+ "learning_rate": 4.551290948686693e-06,
+ "loss": 0.5121,
+ "step": 327
+ },
+ {
+ "epoch": 1.5454545454545454,
+ "grad_norm": 0.4782696068286896,
+ "learning_rate": 4.547452938100615e-06,
+ "loss": 0.5176,
+ "step": 328
+ },
+ {
+ "epoch": 1.5501567398119123,
+ "grad_norm": 0.5119273066520691,
+ "learning_rate": 4.54360021597274e-06,
+ "loss": 0.4941,
+ "step": 329
+ },
+ {
+ "epoch": 1.5548589341692791,
+ "grad_norm": 0.5010069608688354,
+ "learning_rate": 4.539732809985989e-06,
+ "loss": 0.4862,
+ "step": 330
+ },
+ {
+ "epoch": 1.5595611285266457,
+ "grad_norm": 0.5129932165145874,
+ "learning_rate": 4.535850747928796e-06,
+ "loss": 0.4978,
+ "step": 331
+ },
+ {
+ "epoch": 1.5642633228840124,
+ "grad_norm": 0.4957594573497772,
+ "learning_rate": 4.531954057694897e-06,
+ "loss": 0.4814,
+ "step": 332
+ },
+ {
+ "epoch": 1.5689655172413794,
+ "grad_norm": 0.5642824172973633,
+ "learning_rate": 4.5280427672831414e-06,
+ "loss": 0.4888,
+ "step": 333
+ },
+ {
+ "epoch": 1.573667711598746,
+ "grad_norm": 0.4562854468822479,
+ "learning_rate": 4.524116904797281e-06,
+ "loss": 0.4648,
+ "step": 334
+ },
+ {
+ "epoch": 1.5783699059561127,
+ "grad_norm": 0.4849218428134918,
+ "learning_rate": 4.520176498445774e-06,
+ "loss": 0.476,
+ "step": 335
+ },
+ {
+ "epoch": 1.5830721003134798,
+ "grad_norm": 0.5046947002410889,
+ "learning_rate": 4.516221576541581e-06,
+ "loss": 0.4776,
+ "step": 336
+ },
+ {
+ "epoch": 1.5877742946708464,
+ "grad_norm": 0.48211777210235596,
+ "learning_rate": 4.512252167501959e-06,
+ "loss": 0.479,
+ "step": 337
+ },
+ {
+ "epoch": 1.592476489028213,
+ "grad_norm": 0.4812171459197998,
+ "learning_rate": 4.508268299848262e-06,
+ "loss": 0.4849,
+ "step": 338
+ },
+ {
+ "epoch": 1.59717868338558,
+ "grad_norm": 0.5865142345428467,
+ "learning_rate": 4.50427000220573e-06,
+ "loss": 0.499,
+ "step": 339
+ },
+ {
+ "epoch": 1.6018808777429467,
+ "grad_norm": 0.49277785420417786,
+ "learning_rate": 4.50025730330329e-06,
+ "loss": 0.475,
+ "step": 340
+ },
+ {
+ "epoch": 1.6065830721003134,
+ "grad_norm": 0.46771496534347534,
+ "learning_rate": 4.4962302319733445e-06,
+ "loss": 0.494,
+ "step": 341
+ },
+ {
+ "epoch": 1.6112852664576802,
+ "grad_norm": 0.5189441442489624,
+ "learning_rate": 4.492188817151565e-06,
+ "loss": 0.5275,
+ "step": 342
+ },
+ {
+ "epoch": 1.615987460815047,
+ "grad_norm": 0.48845574259757996,
+ "learning_rate": 4.488133087876688e-06,
+ "loss": 0.4676,
+ "step": 343
+ },
+ {
+ "epoch": 1.6206896551724137,
+ "grad_norm": 0.47189632058143616,
+ "learning_rate": 4.484063073290301e-06,
+ "loss": 0.4642,
+ "step": 344
+ },
+ {
+ "epoch": 1.6253918495297806,
+ "grad_norm": 0.5442587733268738,
+ "learning_rate": 4.479978802636637e-06,
+ "loss": 0.4981,
+ "step": 345
+ },
+ {
+ "epoch": 1.6300940438871474,
+ "grad_norm": 0.5048685073852539,
+ "learning_rate": 4.475880305262362e-06,
+ "loss": 0.5037,
+ "step": 346
+ },
+ {
+ "epoch": 1.634796238244514,
+ "grad_norm": 0.4781409800052643,
+ "learning_rate": 4.471767610616366e-06,
+ "loss": 0.4932,
+ "step": 347
+ },
+ {
+ "epoch": 1.6394984326018809,
+ "grad_norm": 0.47388938069343567,
+ "learning_rate": 4.467640748249549e-06,
+ "loss": 0.4687,
+ "step": 348
+ },
+ {
+ "epoch": 1.6442006269592477,
+ "grad_norm": 0.529712438583374,
+ "learning_rate": 4.4634997478146125e-06,
+ "loss": 0.487,
+ "step": 349
+ },
+ {
+ "epoch": 1.6489028213166144,
+ "grad_norm": 0.5114791393280029,
+ "learning_rate": 4.459344639065842e-06,
+ "loss": 0.4809,
+ "step": 350
+ },
+ {
+ "epoch": 1.6536050156739812,
+ "grad_norm": 0.45415258407592773,
+ "learning_rate": 4.455175451858897e-06,
+ "loss": 0.4901,
+ "step": 351
+ },
+ {
+ "epoch": 1.658307210031348,
+ "grad_norm": 0.5842339396476746,
+ "learning_rate": 4.450992216150592e-06,
+ "loss": 0.499,
+ "step": 352
+ },
+ {
+ "epoch": 1.6630094043887147,
+ "grad_norm": 0.48795560002326965,
+ "learning_rate": 4.446794961998689e-06,
+ "loss": 0.4659,
+ "step": 353
+ },
+ {
+ "epoch": 1.6677115987460815,
+ "grad_norm": 0.5531855225563049,
+ "learning_rate": 4.442583719561671e-06,
+ "loss": 0.4923,
+ "step": 354
+ },
+ {
+ "epoch": 1.6724137931034484,
+ "grad_norm": 0.5827644467353821,
+ "learning_rate": 4.438358519098536e-06,
+ "loss": 0.4991,
+ "step": 355
+ },
+ {
+ "epoch": 1.677115987460815,
+ "grad_norm": 0.5260423421859741,
+ "learning_rate": 4.4341193909685685e-06,
+ "loss": 0.4843,
+ "step": 356
+ },
+ {
+ "epoch": 1.6818181818181817,
+ "grad_norm": 0.4969344437122345,
+ "learning_rate": 4.429866365631134e-06,
+ "loss": 0.4915,
+ "step": 357
+ },
+ {
+ "epoch": 1.6865203761755487,
+ "grad_norm": 0.4725005030632019,
+ "learning_rate": 4.425599473645447e-06,
+ "loss": 0.4804,
+ "step": 358
+ },
+ {
+ "epoch": 1.6912225705329154,
+ "grad_norm": 0.47171467542648315,
+ "learning_rate": 4.421318745670364e-06,
+ "loss": 0.4823,
+ "step": 359
+ },
+ {
+ "epoch": 1.695924764890282,
+ "grad_norm": 0.4839799106121063,
+ "learning_rate": 4.4170242124641524e-06,
+ "loss": 0.4585,
+ "step": 360
+ },
+ {
+ "epoch": 1.700626959247649,
+ "grad_norm": 0.4786856472492218,
+ "learning_rate": 4.412715904884277e-06,
+ "loss": 0.49,
+ "step": 361
+ },
+ {
+ "epoch": 1.7053291536050157,
+ "grad_norm": 0.49980080127716064,
+ "learning_rate": 4.4083938538871735e-06,
+ "loss": 0.4675,
+ "step": 362
+ },
+ {
+ "epoch": 1.7100313479623823,
+ "grad_norm": 0.5201369524002075,
+ "learning_rate": 4.4040580905280295e-06,
+ "loss": 0.4862,
+ "step": 363
+ },
+ {
+ "epoch": 1.7147335423197492,
+ "grad_norm": 0.7051575183868408,
+ "learning_rate": 4.3997086459605586e-06,
+ "loss": 0.4822,
+ "step": 364
+ },
+ {
+ "epoch": 1.719435736677116,
+ "grad_norm": 0.48206666111946106,
+ "learning_rate": 4.395345551436779e-06,
+ "loss": 0.5076,
+ "step": 365
+ },
+ {
+ "epoch": 1.7241379310344827,
+ "grad_norm": 0.4817257821559906,
+ "learning_rate": 4.390968838306788e-06,
+ "loss": 0.4623,
+ "step": 366
+ },
+ {
+ "epoch": 1.7288401253918495,
+ "grad_norm": 0.5547840595245361,
+ "learning_rate": 4.386578538018535e-06,
+ "loss": 0.461,
+ "step": 367
+ },
+ {
+ "epoch": 1.7335423197492164,
+ "grad_norm": 0.5085346698760986,
+ "learning_rate": 4.382174682117598e-06,
+ "loss": 0.5068,
+ "step": 368
+ },
+ {
+ "epoch": 1.738244514106583,
+ "grad_norm": 0.4870692193508148,
+ "learning_rate": 4.377757302246956e-06,
+ "loss": 0.4403,
+ "step": 369
+ },
+ {
+ "epoch": 1.7429467084639498,
+ "grad_norm": 0.49482715129852295,
+ "learning_rate": 4.373326430146762e-06,
+ "loss": 0.4986,
+ "step": 370
+ },
+ {
+ "epoch": 1.7476489028213167,
+ "grad_norm": 0.5474854707717896,
+ "learning_rate": 4.368882097654113e-06,
+ "loss": 0.4938,
+ "step": 371
+ },
+ {
+ "epoch": 1.7523510971786833,
+ "grad_norm": 0.5055244565010071,
+ "learning_rate": 4.364424336702825e-06,
+ "loss": 0.4711,
+ "step": 372
+ },
+ {
+ "epoch": 1.7570532915360502,
+ "grad_norm": 0.48241329193115234,
+ "learning_rate": 4.3599531793232e-06,
+ "loss": 0.4856,
+ "step": 373
+ },
+ {
+ "epoch": 1.761755485893417,
+ "grad_norm": 0.4932602047920227,
+ "learning_rate": 4.355468657641797e-06,
+ "loss": 0.4818,
+ "step": 374
+ },
+ {
+ "epoch": 1.7664576802507836,
+ "grad_norm": 0.5512160658836365,
+ "learning_rate": 4.3509708038812035e-06,
+ "loss": 0.4864,
+ "step": 375
+ },
+ {
+ "epoch": 1.7711598746081505,
+ "grad_norm": 0.47026327252388,
+ "learning_rate": 4.346459650359798e-06,
+ "loss": 0.4825,
+ "step": 376
+ },
+ {
+ "epoch": 1.7758620689655173,
+ "grad_norm": 0.4831086993217468,
+ "learning_rate": 4.341935229491525e-06,
+ "loss": 0.4541,
+ "step": 377
+ },
+ {
+ "epoch": 1.780564263322884,
+ "grad_norm": 0.5045217871665955,
+ "learning_rate": 4.337397573785659e-06,
+ "loss": 0.5025,
+ "step": 378
+ },
+ {
+ "epoch": 1.7852664576802508,
+ "grad_norm": 0.5657753348350525,
+ "learning_rate": 4.332846715846566e-06,
+ "loss": 0.4698,
+ "step": 379
+ },
+ {
+ "epoch": 1.7899686520376177,
+ "grad_norm": 0.49546748399734497,
+ "learning_rate": 4.328282688373479e-06,
+ "loss": 0.4911,
+ "step": 380
+ },
+ {
+ "epoch": 1.7946708463949843,
+ "grad_norm": 0.5037291049957275,
+ "learning_rate": 4.323705524160258e-06,
+ "loss": 0.4877,
+ "step": 381
+ },
+ {
+ "epoch": 1.799373040752351,
+ "grad_norm": 0.5256901383399963,
+ "learning_rate": 4.319115256095149e-06,
+ "loss": 0.4662,
+ "step": 382
+ },
+ {
+ "epoch": 1.804075235109718,
+ "grad_norm": 0.4890702962875366,
+ "learning_rate": 4.314511917160557e-06,
+ "loss": 0.4683,
+ "step": 383
+ },
+ {
+ "epoch": 1.8087774294670846,
+ "grad_norm": 0.4724109470844269,
+ "learning_rate": 4.3098955404328045e-06,
+ "loss": 0.4602,
+ "step": 384
+ },
+ {
+ "epoch": 1.8134796238244513,
+ "grad_norm": 0.4933278560638428,
+ "learning_rate": 4.305266159081895e-06,
+ "loss": 0.4806,
+ "step": 385
+ },
+ {
+ "epoch": 1.8181818181818183,
+ "grad_norm": 0.5068219304084778,
+ "learning_rate": 4.3006238063712725e-06,
+ "loss": 0.4647,
+ "step": 386
+ },
+ {
+ "epoch": 1.822884012539185,
+ "grad_norm": 0.5293509364128113,
+ "learning_rate": 4.295968515657583e-06,
+ "loss": 0.4998,
+ "step": 387
+ },
+ {
+ "epoch": 1.8275862068965516,
+ "grad_norm": 0.4775199294090271,
+ "learning_rate": 4.29130032039044e-06,
+ "loss": 0.4821,
+ "step": 388
+ },
+ {
+ "epoch": 1.8322884012539185,
+ "grad_norm": 0.4914006292819977,
+ "learning_rate": 4.2866192541121755e-06,
+ "loss": 0.4735,
+ "step": 389
+ },
+ {
+ "epoch": 1.8369905956112853,
+ "grad_norm": 0.5009908080101013,
+ "learning_rate": 4.281925350457606e-06,
+ "loss": 0.4741,
+ "step": 390
+ },
+ {
+ "epoch": 1.841692789968652,
+ "grad_norm": 0.47211164236068726,
+ "learning_rate": 4.277218643153787e-06,
+ "loss": 0.4786,
+ "step": 391
+ },
+ {
+ "epoch": 1.8463949843260188,
+ "grad_norm": 1.9644113779067993,
+ "learning_rate": 4.272499166019771e-06,
+ "loss": 0.4759,
+ "step": 392
+ },
+ {
+ "epoch": 1.8510971786833856,
+ "grad_norm": 0.535971999168396,
+ "learning_rate": 4.267766952966369e-06,
+ "loss": 0.4665,
+ "step": 393
+ },
+ {
+ "epoch": 1.8557993730407523,
+ "grad_norm": 0.4666787385940552,
+ "learning_rate": 4.2630220379959006e-06,
+ "loss": 0.4417,
+ "step": 394
+ },
+ {
+ "epoch": 1.8605015673981191,
+ "grad_norm": 0.5976264476776123,
+ "learning_rate": 4.258264455201953e-06,
+ "loss": 0.4665,
+ "step": 395
+ },
+ {
+ "epoch": 1.865203761755486,
+ "grad_norm": 0.4814331531524658,
+ "learning_rate": 4.2534942387691335e-06,
+ "loss": 0.4896,
+ "step": 396
+ },
+ {
+ "epoch": 1.8699059561128526,
+ "grad_norm": 0.4929859936237335,
+ "learning_rate": 4.248711422972829e-06,
+ "loss": 0.4765,
+ "step": 397
+ },
+ {
+ "epoch": 1.8746081504702194,
+ "grad_norm": 0.517914354801178,
+ "learning_rate": 4.243916042178954e-06,
+ "loss": 0.4601,
+ "step": 398
+ },
+ {
+ "epoch": 1.8793103448275863,
+ "grad_norm": 0.47731271386146545,
+ "learning_rate": 4.239108130843709e-06,
+ "loss": 0.469,
+ "step": 399
+ },
+ {
+ "epoch": 1.884012539184953,
+ "grad_norm": 0.4939954876899719,
+ "learning_rate": 4.234287723513326e-06,
+ "loss": 0.4929,
+ "step": 400
+ },
+ {
+ "epoch": 1.8887147335423198,
+ "grad_norm": 0.48573923110961914,
+ "learning_rate": 4.229454854823827e-06,
+ "loss": 0.4913,
+ "step": 401
+ },
+ {
+ "epoch": 1.8934169278996866,
+ "grad_norm": 0.5146409273147583,
+ "learning_rate": 4.224609559500772e-06,
+ "loss": 0.502,
+ "step": 402
+ },
+ {
+ "epoch": 1.8981191222570533,
+ "grad_norm": 0.4884675443172455,
+ "learning_rate": 4.21975187235901e-06,
+ "loss": 0.4541,
+ "step": 403
+ },
+ {
+ "epoch": 1.90282131661442,
+ "grad_norm": 0.4871810972690582,
+ "learning_rate": 4.21488182830243e-06,
+ "loss": 0.4811,
+ "step": 404
+ },
+ {
+ "epoch": 1.907523510971787,
+ "grad_norm": 0.5089552402496338,
+ "learning_rate": 4.209999462323706e-06,
+ "loss": 0.4584,
+ "step": 405
+ },
+ {
+ "epoch": 1.9122257053291536,
+ "grad_norm": 0.6191231608390808,
+ "learning_rate": 4.20510480950405e-06,
+ "loss": 0.4885,
+ "step": 406
+ },
+ {
+ "epoch": 1.9169278996865202,
+ "grad_norm": 0.5512096285820007,
+ "learning_rate": 4.200197905012961e-06,
+ "loss": 0.4529,
+ "step": 407
+ },
+ {
+ "epoch": 1.9216300940438873,
+ "grad_norm": 0.4743112027645111,
+ "learning_rate": 4.195278784107965e-06,
+ "loss": 0.4702,
+ "step": 408
+ },
+ {
+ "epoch": 1.926332288401254,
+ "grad_norm": 0.4635118544101715,
+ "learning_rate": 4.19034748213437e-06,
+ "loss": 0.4718,
+ "step": 409
+ },
+ {
+ "epoch": 1.9310344827586206,
+ "grad_norm": 0.48715919256210327,
+ "learning_rate": 4.185404034525008e-06,
+ "loss": 0.4638,
+ "step": 410
+ },
+ {
+ "epoch": 1.9357366771159876,
+ "grad_norm": 0.5373724102973938,
+ "learning_rate": 4.180448476799981e-06,
+ "loss": 0.5009,
+ "step": 411
+ },
+ {
+ "epoch": 1.9404388714733543,
+ "grad_norm": 0.4978715479373932,
+ "learning_rate": 4.175480844566404e-06,
+ "loss": 0.4726,
+ "step": 412
+ },
+ {
+ "epoch": 1.9451410658307209,
+ "grad_norm": 0.44817060232162476,
+ "learning_rate": 4.170501173518152e-06,
+ "loss": 0.4683,
+ "step": 413
+ },
+ {
+ "epoch": 1.9498432601880877,
+ "grad_norm": 0.48472973704338074,
+ "learning_rate": 4.165509499435604e-06,
+ "loss": 0.4662,
+ "step": 414
+ },
+ {
+ "epoch": 1.9545454545454546,
+ "grad_norm": 0.6567174792289734,
+ "learning_rate": 4.16050585818538e-06,
+ "loss": 0.4801,
+ "step": 415
+ },
+ {
+ "epoch": 1.9592476489028212,
+ "grad_norm": 0.5131425857543945,
+ "learning_rate": 4.155490285720092e-06,
+ "loss": 0.5036,
+ "step": 416
+ },
+ {
+ "epoch": 1.963949843260188,
+ "grad_norm": 0.46051982045173645,
+ "learning_rate": 4.150462818078079e-06,
+ "loss": 0.4911,
+ "step": 417
+ },
+ {
+ "epoch": 1.968652037617555,
+ "grad_norm": 0.5288883447647095,
+ "learning_rate": 4.145423491383153e-06,
+ "loss": 0.4871,
+ "step": 418
+ },
+ {
+ "epoch": 1.9733542319749215,
+ "grad_norm": 0.5143817663192749,
+ "learning_rate": 4.14037234184433e-06,
+ "loss": 0.5027,
+ "step": 419
+ },
+ {
+ "epoch": 1.9780564263322884,
+ "grad_norm": 0.46323707699775696,
+ "learning_rate": 4.135309405755583e-06,
+ "loss": 0.4876,
+ "step": 420
+ },
+ {
+ "epoch": 1.9827586206896552,
+ "grad_norm": 0.5239706039428711,
+ "learning_rate": 4.130234719495574e-06,
+ "loss": 0.4702,
+ "step": 421
+ },
+ {
+ "epoch": 1.9874608150470219,
+ "grad_norm": 0.538753867149353,
+ "learning_rate": 4.125148319527391e-06,
+ "loss": 0.4638,
+ "step": 422
+ },
+ {
+ "epoch": 1.9921630094043887,
+ "grad_norm": 0.5180181860923767,
+ "learning_rate": 4.1200502423982904e-06,
+ "loss": 0.4841,
+ "step": 423
+ },
+ {
+ "epoch": 1.9968652037617556,
+ "grad_norm": 0.6698167324066162,
+ "learning_rate": 4.1149405247394295e-06,
+ "loss": 0.4882,
+ "step": 424
+ }
+ ],
+ "logging_steps": 1,
+ "max_steps": 1272,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 6,
+ "save_steps": 212,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 2.7298140993196392e+19,
+ "train_batch_size": 4,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/checkpoint-424/training_args.bin b/checkpoint-424/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7db90ca60ea3c300feb3b7d6e0cb54fc7cfb2060
--- /dev/null
+++ b/checkpoint-424/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:51f85402b182fc4b86518e0cb9ca9cbf150300e36000a38f53507b9a8663ad4b
+size 7928
diff --git a/checkpoint-424/zero_to_fp32.py b/checkpoint-424/zero_to_fp32.py
new file mode 100644
index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8
--- /dev/null
+++ b/checkpoint-424/zero_to_fp32.py
@@ -0,0 +1,604 @@
+#!/usr/bin/env python
+
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example: python zero_to_fp32.py . pytorch_model.bin
+
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+from collections import OrderedDict
+from dataclasses import dataclass
+
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+ FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+ FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+
+
+@dataclass
+class zero_model_state:
+    """Per-rank container for the pieces of a DeepSpeed model-state file
+    needed to reconstruct an fp32 state_dict."""
+    # NOTE(review): the annotations below are `dict()` (an instance) rather
+    # than the type `dict`; dataclass treats annotations opaquely so this
+    # works, but it is unconventional — kept byte-identical to upstream.
+    buffers: dict()
+    param_shapes: dict()
+    shared_params: list
+    ds_version: int
+    frozen_param_shapes: dict()
+    frozen_param_fragments: dict()
+
+
+# Truthy value enables verbose tracing prints throughout the reconstruction.
+debug = 0
+
+# load to cpu
+device = torch.device('cpu')
+
+
+def atoi(text):
+    # Convert a purely-numeric token to int; leave any other token unchanged
+    # (building block for natural_keys sort keys).
+    return int(text) if text.isdigit() else text
+
+
+def natural_keys(text):
+    '''
+    alist.sort(key=natural_keys) sorts in human order
+    http://nedbatchelder.com/blog/200712/human_sorting.html
+    (See Toothy's implementation in the comments)
+
+    Splits ``text`` on digit runs so e.g. "file10" sorts after "file2".
+    '''
+    return [atoi(c) for c in re.split(r'(\d+)', text)]
+
+
+def get_model_state_file(checkpoint_dir, zero_stage):
+    """Return the path of the single rank-0 model-states file for the given
+    ZeRO stage; raises FileNotFoundError if the dir or file is missing."""
+    if not os.path.isdir(checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+
+    # there should be only one file
+    if zero_stage <= 2:
+        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+    elif zero_stage == 3:
+        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+
+    # NOTE(review): a zero_stage > 3 would leave `file` unbound and raise
+    # NameError below; callers only pass stages validated elsewhere.
+    if not os.path.exists(file):
+        raise FileNotFoundError(f"can't find model states file at '{file}'")
+
+    return file
+
+
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+    """Return files under checkpoint_dir matching glob_pattern, sorted in
+    natural (human) order; raises FileNotFoundError when none match."""
+    # XXX: need to test that this simple glob rule works for multi-node setup too
+    ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
+
+    if len(ckpt_files) == 0:
+        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+
+    return ckpt_files
+
+
+def get_optim_files(checkpoint_dir):
+    # All per-rank optimizer-state shards, naturally sorted by rank number.
+    return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
+
+
+def get_model_state_files(checkpoint_dir):
+    # All per-rank model-state shards, naturally sorted by rank number.
+    return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
+
+
+def parse_model_states(files):
+    """Load each per-rank model-states file and distill it into a
+    zero_model_state (buffers as fp32, param shapes, shared/frozen params,
+    DeepSpeed version). Raises ValueError if a file lacks BUFFER_NAMES."""
+    zero_model_states = []
+    for file in files:
+        state_dict = torch.load(file, map_location=device)
+
+        if BUFFER_NAMES not in state_dict:
+            raise ValueError(f"{file} is not a model state checkpoint")
+        buffer_names = state_dict[BUFFER_NAMES]
+        if debug:
+            print("Found buffers:", buffer_names)
+
+        # recover just the buffers while restoring them to fp32 if they were saved in fp16
+        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+        param_shapes = state_dict[PARAM_SHAPES]
+
+        # collect parameters that are included in param_shapes
+        param_names = []
+        for s in param_shapes:
+            for name in s.keys():
+                param_names.append(name)
+
+        # update with frozen parameters
+        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+        if frozen_param_shapes is not None:
+            if debug:
+                print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+            param_names += list(frozen_param_shapes.keys())
+
+        # handle shared params
+        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+
+        ds_version = state_dict.get(DS_VERSION, None)
+
+        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+
+        z_model_state = zero_model_state(buffers=buffers,
+                                         param_shapes=param_shapes,
+                                         shared_params=shared_params,
+                                         ds_version=ds_version,
+                                         frozen_param_shapes=frozen_param_shapes,
+                                         frozen_param_fragments=frozen_param_fragments)
+        zero_model_states.append(z_model_state)
+
+    return zero_model_states
+
+
+def parse_optim_states(files, ds_checkpoint_dir):
+    """Load all per-rank optimizer shards and extract (zero_stage, world_size,
+    fp32_flat_groups). For stage 3 the per-group flat tensors of each rank are
+    concatenated into one tensor per rank. Raises ValueError on a non-zero
+    checkpoint or a rank-count mismatch."""
+
+    total_files = len(files)
+    state_dicts = []
+    for f in files:
+        state_dict = torch.load(f, map_location=device)
+        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
+        # and also handle the case where it was already removed by another helper script
+        state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
+        state_dicts.append(state_dict)
+
+    if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]:
+        raise ValueError(f"{files[0]} is not a zero checkpoint")
+    zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
+    world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
+
+    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+    # parameters can be different from data parallelism for non-expert parameters. So we can just
+    # use the max of the partition_count to get the dp world_size.
+
+    if type(world_size) is list:
+        world_size = max(world_size)
+
+    if world_size != total_files:
+        raise ValueError(
+            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+        )
+
+    # the groups are named differently in each stage
+    if zero_stage <= 2:
+        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+    elif zero_stage == 3:
+        fp32_groups_key = FP32_FLAT_GROUPS
+    else:
+        raise ValueError(f"unknown zero stage {zero_stage}")
+
+    if zero_stage <= 2:
+        fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
+    elif zero_stage == 3:
+        # if there is more than one param group, there will be multiple flattened tensors - one
+        # flattened tensor per group - for simplicity merge them into a single tensor
+        #
+        # XXX: could make the script more memory efficient for when there are multiple groups - it
+        # will require matching the sub-lists of param_shapes for each param group flattened tensor
+
+        fp32_flat_groups = [
+            torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts))
+        ]
+
+    return zero_stage, world_size, fp32_flat_groups
+
+
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+    """
+    Returns fp32 state_dict reconstructed from ds checkpoint
+
+    Args:
+        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+        - ``exclude_frozen_parameters``: when True, frozen (non-trainable) parameters are omitted
+          from the reconstructed state_dict
+
+    """
+    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+
+    optim_files = get_optim_files(ds_checkpoint_dir)
+    zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
+    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+
+    model_files = get_model_state_files(ds_checkpoint_dir)
+
+    zero_model_states = parse_model_states(model_files)
+    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+
+    # Dispatch on the detected stage; stages 1 and 2 share one layout.
+    if zero_stage <= 2:
+        return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+    elif zero_stage == 3:
+        return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                          exclude_frozen_parameters)
+
+
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+    """Copy frozen-parameter fragments from rank 0 into state_dict (ZeRO-1/2:
+    frozen params are not partitioned, so rank 0 holds full tensors).
+    No-op when the checkpoint has no frozen params."""
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+
+    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+    frozen_param_fragments = zero_model_states[0].frozen_param_fragments
+
+    if debug:
+        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+    wanted_params = len(frozen_param_shapes)
+    wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+    avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+    print(f'Frozen params: Have {avail_numel} numels to process.')
+    print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for name, shape in frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+
+        state_dict[name] = frozen_param_fragments[name]
+
+        if debug:
+            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _has_callable(obj, fn):
+    # True when obj has an attribute named fn and that attribute is callable.
+    attr = getattr(obj, fn, None)
+    return callable(attr)
+
+
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    """Rebuild trainable params for ZeRO-1/2: concatenate each param group's
+    per-rank fp32 partitions into one flat vector, then slice out each
+    parameter by its recorded shape. Raises ValueError when the consumed
+    element count (after 2*world_size alignment) mismatches what is available."""
+    param_shapes = zero_model_states[0].param_shapes
+
+    # Reconstruction protocol:
+    #
+    # XXX: document this
+
+    if debug:
+        for i in range(world_size):
+            for j in range(len(fp32_flat_groups[0])):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+
+    # XXX: memory usage doubles here (zero2)
+    num_param_groups = len(fp32_flat_groups[0])
+    merged_single_partition_of_fp32_groups = []
+    for i in range(num_param_groups):
+        merged_partitions = [sd[i] for sd in fp32_flat_groups]
+        full_single_fp32_vector = torch.cat(merged_partitions, 0)
+        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+    avail_numel = sum(
+        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+
+    if debug:
+        wanted_params = sum([len(shapes) for shapes in param_shapes])
+        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Have {avail_numel} numels to process.")
+        print(f"Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    total_numel = 0
+    total_params = 0
+    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+        offset = 0
+        avail_numel = full_single_fp32_vector.numel()
+        for name, shape in shapes.items():
+
+            # shape may be a tensor-like with .numel() or a plain tuple/list
+            unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+            total_numel += unpartitioned_numel
+            total_params += 1
+
+            if debug:
+                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+            offset += unpartitioned_numel
+
+    # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+    # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+    # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+    # live optimizer object, so we are checking that the numbers are within the right range
+    align_to = 2 * world_size
+
+    def zero2_align(x):
+        return align_to * math.ceil(x / align_to)
+
+    if debug:
+        print(f"original offset={offset}, avail_numel={avail_numel}")
+
+    # NOTE(review): `offset`/`avail_numel` here are the loop variables left
+    # over from the LAST param group iteration — matches upstream behavior.
+    offset = zero2_align(offset)
+    avail_numel = zero2_align(avail_numel)
+
+    if debug:
+        print(f"aligned offset={offset}, avail_numel={avail_numel}")
+
+    # Sanity check
+    if offset != avail_numel:
+        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    """Assemble the full fp32 state_dict for a ZeRO-1/2 checkpoint: buffers,
+    then (optionally) frozen params, then trainable params, then aliased
+    shared params. Returns an OrderedDict keyed by parameter name."""
+    state_dict = OrderedDict()
+
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+
+    if not exclude_frozen_parameters:
+        _zero2_merge_frozen_params(state_dict, zero_model_states)
+
+    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+
+    return state_dict
+
+
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+    """Return (per-rank partition size, padding element count) for a param of
+    unpartitioned_numel elements split evenly across world_size ranks."""
+    remainder = unpartitioned_numel % world_size
+    padding_numel = (world_size - remainder) if remainder else 0
+    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+    return partitioned_numel, padding_numel
+
+
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+    """Rebuild frozen params for ZeRO-3: each rank holds a fragment, so
+    fragments are concatenated across ranks and trimmed of padding.
+    No-op when the checkpoint has no frozen params."""
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+
+    if debug:
+        for i in range(world_size):
+            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+    wanted_params = len(frozen_param_shapes)
+    wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+    avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+    print(f'Frozen params: Have {avail_numel} numels to process.')
+    print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+    total_params = 0
+    total_numel = 0
+    for name, shape in zero_model_states[0].frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+
+        # concat the per-rank fragments, drop trailing padding, restore shape
+        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+        state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    """Rebuild trainable params for ZeRO-3: walk the flat per-rank partitions
+    in parameter order, zipping one partition slice per rank for each param and
+    trimming padding. Raises ValueError when consumed != available elements."""
+    param_shapes = zero_model_states[0].param_shapes
+    avail_numel = fp32_flat_groups[0].numel() * world_size
+    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+    # param, re-consolidating each param, while dealing with padding if any
+
+    # merge list of dicts, preserving order
+    param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+
+    if debug:
+        for i in range(world_size):
+            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+
+    wanted_params = len(param_shapes)
+    wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+    # not asserting if there is a mismatch due to possible padding
+    # NOTE(review): avail_numel was already computed identically above —
+    # redundant but harmless; kept byte-identical to upstream.
+    avail_numel = fp32_flat_groups[0].numel() * world_size
+    print(f"Trainable params: Have {avail_numel} numels to process.")
+    print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    offset = 0
+    total_numel = 0
+    total_params = 0
+    for name, shape in param_shapes.items():
+
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        total_params += 1
+
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+        # XXX: memory usage doubles here
+        state_dict[name] = torch.cat(
+            tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)),
+            0).narrow(0, 0, unpartitioned_numel).view(shape)
+        offset += partitioned_numel
+
+    # offset advanced by one partition per param; scale to total elements
+    offset *= world_size
+
+    # Sanity check
+    if offset != avail_numel:
+        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    """Build a full fp32 state_dict from ZeRO-3 shards.
+
+    Order matters: buffers first, then (optionally) frozen params, then
+    trainable params, and finally re-link shared params by aliasing.
+    """
+    state_dict = OrderedDict()
+
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+
+    if not exclude_frozen_parameters:
+        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+
+    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters
+    # pair is (alias_name, source_name): alias the already-reconstructed tensor.
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+
+    return state_dict
+
+
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+    via a model hub.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+
+    Returns:
+        - pytorch ``state_dict``
+
+    Note: this approach may not work if your application doesn't have sufficient free CPU memory and
+    you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+    the checkpoint.
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        # do the training and checkpoint saving
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+        model = model.cpu() # move to cpu
+        model.load_state_dict(state_dict)
+        # submit to model hub or save the model to share with others
+
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+
+    """
+    if tag is None:
+        # Resolve the step-tag from the 'latest' marker file DeepSpeed writes
+        # alongside its checkpoints (its content is e.g. "global_step636").
+        latest_path = os.path.join(checkpoint_dir, 'latest')
+        if os.path.isfile(latest_path):
+            with open(latest_path, 'r') as fd:
+                tag = fd.read().strip()
+        else:
+            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+
+    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+
+    if not os.path.isdir(ds_checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+
+    return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+
+
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
+    loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+    """
+
+    # Consolidate in memory first, then serialize in one shot.
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters)
+    print(f"Saving fp32 state dict to {output_file}")
+    torch.save(state_dict, output_file)
+
+
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+    """
+    1. Put the provided model to cpu
+    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+    3. Load it into the provided model
+
+    Args:
+        - ``model``: the model object to update
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+
+    Returns:
+        - ``model``: modified model
+
+    Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+    conveniently placed for you in the checkpoint folder.
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+        # submit to model hub or save the model to share with others
+
+    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    """
+    logger.info(f"Extracting fp32 weights")
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+
+    logger.info(f"Overwriting model with fp32 weights")
+    model = model.cpu()
+    # strict=False: the consolidated dict may omit keys the live model has
+    # (e.g. frozen params excluded or extra buffers); tolerate the mismatch.
+    model.load_state_dict(state_dict, strict=False)
+
+    return model
+
+
+if __name__ == "__main__":
+
+    # CLI entry point: offline conversion of a DeepSpeed ZeRO checkpoint into a
+    # single consolidated fp32 state_dict file.
+    parser = argparse.ArgumentParser()
+    parser.add_argument("checkpoint_dir",
+                        type=str,
+                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+    parser.add_argument(
+        "output_file",
+        type=str,
+        help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)")
+    parser.add_argument("-t",
+                        "--tag",
+                        type=str,
+                        default=None,
+                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+    args = parser.parse_args()
+
+    # Module-global read by the merge helpers to gate verbose diagnostics.
+    debug = args.debug
+
+    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+                                               args.output_file,
+                                               tag=args.tag,
+                                               exclude_frozen_parameters=args.exclude_frozen_parameters)
diff --git a/checkpoint-636/README.md b/checkpoint-636/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1b184114a0c28ed3e4c082c18486736dc818166d
--- /dev/null
+++ b/checkpoint-636/README.md
@@ -0,0 +1,202 @@
+---
+base_model: meta-llama/Llama-3.3-70B-Instruct
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.15.0
\ No newline at end of file
diff --git a/checkpoint-636/adapter_config.json b/checkpoint-636/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..dc930e1be2d901773c96d6e6d186c72676cbf328
--- /dev/null
+++ b/checkpoint-636/adapter_config.json
@@ -0,0 +1,42 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "meta-llama/Llama-3.3-70B-Instruct",
+ "bias": "none",
+ "corda_config": null,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": null,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 512,
+ "lora_bias": false,
+ "lora_dropout": 0.05,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": [
+ "embed_tokens",
+ "lm_head"
+ ],
+ "peft_type": "LORA",
+ "r": 256,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "up_proj",
+ "gate_proj",
+ "o_proj",
+ "v_proj",
+ "q_proj",
+ "k_proj",
+ "down_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-636/adapter_model.safetensors b/checkpoint-636/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..ce70e87cd83c1773474bfd7f9065ebd19db4ae0e
--- /dev/null
+++ b/checkpoint-636/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6975ff65fd415fc72937ab6223f4a219557bb0917db1a986593a170f48c71101
+size 10829849744
diff --git a/checkpoint-636/global_step636/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-636/global_step636/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..4f487f48d41b8d7a68fa5a0b598f14c3e329ce05
--- /dev/null
+++ b/checkpoint-636/global_step636/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:38a025141cd5edb4a8d71d40f4be280c9085848ecd6af14aed1bb471b7e19fcb
+size 21659418140
diff --git a/checkpoint-636/global_step636/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-636/global_step636/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..52c058e2edf030c3e094a3b2667f36605f58c952
--- /dev/null
+++ b/checkpoint-636/global_step636/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:08b2bb75339a7d5aac8bdf7b42c3a94316fa585a4b8c3b6b9b3a721db40eb4bc
+size 21659457372
diff --git a/checkpoint-636/global_step636/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-636/global_step636/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..94f97b3a2f8200fa77f30f58ebeb8d0c90f6984e
--- /dev/null
+++ b/checkpoint-636/global_step636/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3ae130c5fd74adba1278eaece3030ebb2c396f2fe1e04f9070300d6a21638f8b
+size 21659417820
diff --git a/checkpoint-636/global_step636/mp_rank_00_model_states.pt b/checkpoint-636/global_step636/mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..9e41bfed8b143bac7640761a02387e1f037b5548
--- /dev/null
+++ b/checkpoint-636/global_step636/mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cdc01042bdd187ade2e2666c781209656617df187ee8293131629fb679d01064
+size 11918643933
diff --git a/checkpoint-636/latest b/checkpoint-636/latest
new file mode 100644
index 0000000000000000000000000000000000000000..7cb7374b21fd322c33b2aed487af4a75e2644bec
--- /dev/null
+++ b/checkpoint-636/latest
@@ -0,0 +1 @@
+global_step636
\ No newline at end of file
diff --git a/checkpoint-636/rng_state_0.pth b/checkpoint-636/rng_state_0.pth
new file mode 100644
index 0000000000000000000000000000000000000000..d72822bdc5a15cf0512d7e3d7af0d637eb544886
--- /dev/null
+++ b/checkpoint-636/rng_state_0.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bb07a812b61bd3dab34426d42d0ec5638ca7a0af6a41608304a6db213359979a
+size 14768
diff --git a/checkpoint-636/rng_state_1.pth b/checkpoint-636/rng_state_1.pth
new file mode 100644
index 0000000000000000000000000000000000000000..b38947f4b1c435baefadf866d239a520d1934ab5
--- /dev/null
+++ b/checkpoint-636/rng_state_1.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:51b5c45b835addff11d442db4109f939084398c4d613fcb8e7bcbf268cc0ad87
+size 14768
diff --git a/checkpoint-636/rng_state_2.pth b/checkpoint-636/rng_state_2.pth
new file mode 100644
index 0000000000000000000000000000000000000000..999e4c7bf62ed81a323883e13bfc3c1175120e1c
--- /dev/null
+++ b/checkpoint-636/rng_state_2.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8b34067505aa56e55eb9cbaf091cab75cf81e923f196cee6bca0beb3e156123c
+size 14768
diff --git a/checkpoint-636/scheduler.pt b/checkpoint-636/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..b80151909c758cb7e9302c9d85f3b766a33fb693
--- /dev/null
+++ b/checkpoint-636/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6e16d798a384ea04695b1fd65577c8a60a817755b2ef9a9dc02ea8f3f0187f11
+size 1064
diff --git a/checkpoint-636/special_tokens_map.json b/checkpoint-636/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..278b7f0f84be865c4687700ee7b3c63d89a51e18
--- /dev/null
+++ b/checkpoint-636/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+ "bos_token": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "<|eot_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/checkpoint-636/tokenizer.json b/checkpoint-636/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2
--- /dev/null
+++ b/checkpoint-636/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
+size 17209920
diff --git a/checkpoint-636/tokenizer_config.json b/checkpoint-636/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ca91a2ef55f4239a7af81d7c9abb05f53621a07b
--- /dev/null
+++ b/checkpoint-636/tokenizer_config.json
@@ -0,0 +1,2064 @@
+{
+ "added_tokens_decoder": {
+ "128000": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128001": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128002": {
+ "content": "<|reserved_special_token_0|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128003": {
+ "content": "<|reserved_special_token_1|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128004": {
+ "content": "<|finetune_right_pad_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128005": {
+ "content": "<|reserved_special_token_2|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128006": {
+ "content": "<|start_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128007": {
+ "content": "<|end_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128008": {
+ "content": "<|eom_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128009": {
+ "content": "<|eot_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128010": {
+ "content": "<|python_tag|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128011": {
+ "content": "<|reserved_special_token_3|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128012": {
+ "content": "<|reserved_special_token_4|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128013": {
+ "content": "<|reserved_special_token_5|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128014": {
+ "content": "<|reserved_special_token_6|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128015": {
+ "content": "<|reserved_special_token_7|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128016": {
+ "content": "<|reserved_special_token_8|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128017": {
+ "content": "<|reserved_special_token_9|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128018": {
+ "content": "<|reserved_special_token_10|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128019": {
+ "content": "<|reserved_special_token_11|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128020": {
+ "content": "<|reserved_special_token_12|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128021": {
+ "content": "<|reserved_special_token_13|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128022": {
+ "content": "<|reserved_special_token_14|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128023": {
+ "content": "<|reserved_special_token_15|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128024": {
+ "content": "<|reserved_special_token_16|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128025": {
+ "content": "<|reserved_special_token_17|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128026": {
+ "content": "<|reserved_special_token_18|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128027": {
+ "content": "<|reserved_special_token_19|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128028": {
+ "content": "<|reserved_special_token_20|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128029": {
+ "content": "<|reserved_special_token_21|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128030": {
+ "content": "<|reserved_special_token_22|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128031": {
+ "content": "<|reserved_special_token_23|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128032": {
+ "content": "<|reserved_special_token_24|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128033": {
+ "content": "<|reserved_special_token_25|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128034": {
+ "content": "<|reserved_special_token_26|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128035": {
+ "content": "<|reserved_special_token_27|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128036": {
+ "content": "<|reserved_special_token_28|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128037": {
+ "content": "<|reserved_special_token_29|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128038": {
+ "content": "<|reserved_special_token_30|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128039": {
+ "content": "<|reserved_special_token_31|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128040": {
+ "content": "<|reserved_special_token_32|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128041": {
+ "content": "<|reserved_special_token_33|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128042": {
+ "content": "<|reserved_special_token_34|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128043": {
+ "content": "<|reserved_special_token_35|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128044": {
+ "content": "<|reserved_special_token_36|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128045": {
+ "content": "<|reserved_special_token_37|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128046": {
+ "content": "<|reserved_special_token_38|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128047": {
+ "content": "<|reserved_special_token_39|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128048": {
+ "content": "<|reserved_special_token_40|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128049": {
+ "content": "<|reserved_special_token_41|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128050": {
+ "content": "<|reserved_special_token_42|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128051": {
+ "content": "<|reserved_special_token_43|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128052": {
+ "content": "<|reserved_special_token_44|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128053": {
+ "content": "<|reserved_special_token_45|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128054": {
+ "content": "<|reserved_special_token_46|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128055": {
+ "content": "<|reserved_special_token_47|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128056": {
+ "content": "<|reserved_special_token_48|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128057": {
+ "content": "<|reserved_special_token_49|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128058": {
+ "content": "<|reserved_special_token_50|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128059": {
+ "content": "<|reserved_special_token_51|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128060": {
+ "content": "<|reserved_special_token_52|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128061": {
+ "content": "<|reserved_special_token_53|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128062": {
+ "content": "<|reserved_special_token_54|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128063": {
+ "content": "<|reserved_special_token_55|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128064": {
+ "content": "<|reserved_special_token_56|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128065": {
+ "content": "<|reserved_special_token_57|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128066": {
+ "content": "<|reserved_special_token_58|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128067": {
+ "content": "<|reserved_special_token_59|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128068": {
+ "content": "<|reserved_special_token_60|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128069": {
+ "content": "<|reserved_special_token_61|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128070": {
+ "content": "<|reserved_special_token_62|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128071": {
+ "content": "<|reserved_special_token_63|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128072": {
+ "content": "<|reserved_special_token_64|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128073": {
+ "content": "<|reserved_special_token_65|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128074": {
+ "content": "<|reserved_special_token_66|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128075": {
+ "content": "<|reserved_special_token_67|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128076": {
+ "content": "<|reserved_special_token_68|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128077": {
+ "content": "<|reserved_special_token_69|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128078": {
+ "content": "<|reserved_special_token_70|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128079": {
+ "content": "<|reserved_special_token_71|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128080": {
+ "content": "<|reserved_special_token_72|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128081": {
+ "content": "<|reserved_special_token_73|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128082": {
+ "content": "<|reserved_special_token_74|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128083": {
+ "content": "<|reserved_special_token_75|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128084": {
+ "content": "<|reserved_special_token_76|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128085": {
+ "content": "<|reserved_special_token_77|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128086": {
+ "content": "<|reserved_special_token_78|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128087": {
+ "content": "<|reserved_special_token_79|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128088": {
+ "content": "<|reserved_special_token_80|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128089": {
+ "content": "<|reserved_special_token_81|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128090": {
+ "content": "<|reserved_special_token_82|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128091": {
+ "content": "<|reserved_special_token_83|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128092": {
+ "content": "<|reserved_special_token_84|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128093": {
+ "content": "<|reserved_special_token_85|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128094": {
+ "content": "<|reserved_special_token_86|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128095": {
+ "content": "<|reserved_special_token_87|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128096": {
+ "content": "<|reserved_special_token_88|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128097": {
+ "content": "<|reserved_special_token_89|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128098": {
+ "content": "<|reserved_special_token_90|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128099": {
+ "content": "<|reserved_special_token_91|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128100": {
+ "content": "<|reserved_special_token_92|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128101": {
+ "content": "<|reserved_special_token_93|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128102": {
+ "content": "<|reserved_special_token_94|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128103": {
+ "content": "<|reserved_special_token_95|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128104": {
+ "content": "<|reserved_special_token_96|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128105": {
+ "content": "<|reserved_special_token_97|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128106": {
+ "content": "<|reserved_special_token_98|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128107": {
+ "content": "<|reserved_special_token_99|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128108": {
+ "content": "<|reserved_special_token_100|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128109": {
+ "content": "<|reserved_special_token_101|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128110": {
+ "content": "<|reserved_special_token_102|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128111": {
+ "content": "<|reserved_special_token_103|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128112": {
+ "content": "<|reserved_special_token_104|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128113": {
+ "content": "<|reserved_special_token_105|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128114": {
+ "content": "<|reserved_special_token_106|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128115": {
+ "content": "<|reserved_special_token_107|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128116": {
+ "content": "<|reserved_special_token_108|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128117": {
+ "content": "<|reserved_special_token_109|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128118": {
+ "content": "<|reserved_special_token_110|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128119": {
+ "content": "<|reserved_special_token_111|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128120": {
+ "content": "<|reserved_special_token_112|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128121": {
+ "content": "<|reserved_special_token_113|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128122": {
+ "content": "<|reserved_special_token_114|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128123": {
+ "content": "<|reserved_special_token_115|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128124": {
+ "content": "<|reserved_special_token_116|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128125": {
+ "content": "<|reserved_special_token_117|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128126": {
+ "content": "<|reserved_special_token_118|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128127": {
+ "content": "<|reserved_special_token_119|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128128": {
+ "content": "<|reserved_special_token_120|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128129": {
+ "content": "<|reserved_special_token_121|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128130": {
+ "content": "<|reserved_special_token_122|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128131": {
+ "content": "<|reserved_special_token_123|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128132": {
+ "content": "<|reserved_special_token_124|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128133": {
+ "content": "<|reserved_special_token_125|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128134": {
+ "content": "<|reserved_special_token_126|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128135": {
+ "content": "<|reserved_special_token_127|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128136": {
+ "content": "<|reserved_special_token_128|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128137": {
+ "content": "<|reserved_special_token_129|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128138": {
+ "content": "<|reserved_special_token_130|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128139": {
+ "content": "<|reserved_special_token_131|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128140": {
+ "content": "<|reserved_special_token_132|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128141": {
+ "content": "<|reserved_special_token_133|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128142": {
+ "content": "<|reserved_special_token_134|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128143": {
+ "content": "<|reserved_special_token_135|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128144": {
+ "content": "<|reserved_special_token_136|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128145": {
+ "content": "<|reserved_special_token_137|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128146": {
+ "content": "<|reserved_special_token_138|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128147": {
+ "content": "<|reserved_special_token_139|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128148": {
+ "content": "<|reserved_special_token_140|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128149": {
+ "content": "<|reserved_special_token_141|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128150": {
+ "content": "<|reserved_special_token_142|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128151": {
+ "content": "<|reserved_special_token_143|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128152": {
+ "content": "<|reserved_special_token_144|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128153": {
+ "content": "<|reserved_special_token_145|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128154": {
+ "content": "<|reserved_special_token_146|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128155": {
+ "content": "<|reserved_special_token_147|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128156": {
+ "content": "<|reserved_special_token_148|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128157": {
+ "content": "<|reserved_special_token_149|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128158": {
+ "content": "<|reserved_special_token_150|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128159": {
+ "content": "<|reserved_special_token_151|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128160": {
+ "content": "<|reserved_special_token_152|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128161": {
+ "content": "<|reserved_special_token_153|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128162": {
+ "content": "<|reserved_special_token_154|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128163": {
+ "content": "<|reserved_special_token_155|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128164": {
+ "content": "<|reserved_special_token_156|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128165": {
+ "content": "<|reserved_special_token_157|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128166": {
+ "content": "<|reserved_special_token_158|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128167": {
+ "content": "<|reserved_special_token_159|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128168": {
+ "content": "<|reserved_special_token_160|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128169": {
+ "content": "<|reserved_special_token_161|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128170": {
+ "content": "<|reserved_special_token_162|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128171": {
+ "content": "<|reserved_special_token_163|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128172": {
+ "content": "<|reserved_special_token_164|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128173": {
+ "content": "<|reserved_special_token_165|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128174": {
+ "content": "<|reserved_special_token_166|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128175": {
+ "content": "<|reserved_special_token_167|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128176": {
+ "content": "<|reserved_special_token_168|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128177": {
+ "content": "<|reserved_special_token_169|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128178": {
+ "content": "<|reserved_special_token_170|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128179": {
+ "content": "<|reserved_special_token_171|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128180": {
+ "content": "<|reserved_special_token_172|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128181": {
+ "content": "<|reserved_special_token_173|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128182": {
+ "content": "<|reserved_special_token_174|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128183": {
+ "content": "<|reserved_special_token_175|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128184": {
+ "content": "<|reserved_special_token_176|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128185": {
+ "content": "<|reserved_special_token_177|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128186": {
+ "content": "<|reserved_special_token_178|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128187": {
+ "content": "<|reserved_special_token_179|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128188": {
+ "content": "<|reserved_special_token_180|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128189": {
+ "content": "<|reserved_special_token_181|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128190": {
+ "content": "<|reserved_special_token_182|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128191": {
+ "content": "<|reserved_special_token_183|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128192": {
+ "content": "<|reserved_special_token_184|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128193": {
+ "content": "<|reserved_special_token_185|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128194": {
+ "content": "<|reserved_special_token_186|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128195": {
+ "content": "<|reserved_special_token_187|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128196": {
+ "content": "<|reserved_special_token_188|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128197": {
+ "content": "<|reserved_special_token_189|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128198": {
+ "content": "<|reserved_special_token_190|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128199": {
+ "content": "<|reserved_special_token_191|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128200": {
+ "content": "<|reserved_special_token_192|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128201": {
+ "content": "<|reserved_special_token_193|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128202": {
+ "content": "<|reserved_special_token_194|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128203": {
+ "content": "<|reserved_special_token_195|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128204": {
+ "content": "<|reserved_special_token_196|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128205": {
+ "content": "<|reserved_special_token_197|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128206": {
+ "content": "<|reserved_special_token_198|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128207": {
+ "content": "<|reserved_special_token_199|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128208": {
+ "content": "<|reserved_special_token_200|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128209": {
+ "content": "<|reserved_special_token_201|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128210": {
+ "content": "<|reserved_special_token_202|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128211": {
+ "content": "<|reserved_special_token_203|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128212": {
+ "content": "<|reserved_special_token_204|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128213": {
+ "content": "<|reserved_special_token_205|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128214": {
+ "content": "<|reserved_special_token_206|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128215": {
+ "content": "<|reserved_special_token_207|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128216": {
+ "content": "<|reserved_special_token_208|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128217": {
+ "content": "<|reserved_special_token_209|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128218": {
+ "content": "<|reserved_special_token_210|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128219": {
+ "content": "<|reserved_special_token_211|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128220": {
+ "content": "<|reserved_special_token_212|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128221": {
+ "content": "<|reserved_special_token_213|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128222": {
+ "content": "<|reserved_special_token_214|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128223": {
+ "content": "<|reserved_special_token_215|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128224": {
+ "content": "<|reserved_special_token_216|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128225": {
+ "content": "<|reserved_special_token_217|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128226": {
+ "content": "<|reserved_special_token_218|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128227": {
+ "content": "<|reserved_special_token_219|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128228": {
+ "content": "<|reserved_special_token_220|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128229": {
+ "content": "<|reserved_special_token_221|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128230": {
+ "content": "<|reserved_special_token_222|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128231": {
+ "content": "<|reserved_special_token_223|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128232": {
+ "content": "<|reserved_special_token_224|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128233": {
+ "content": "<|reserved_special_token_225|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128234": {
+ "content": "<|reserved_special_token_226|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128235": {
+ "content": "<|reserved_special_token_227|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128236": {
+ "content": "<|reserved_special_token_228|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128237": {
+ "content": "<|reserved_special_token_229|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128238": {
+ "content": "<|reserved_special_token_230|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128239": {
+ "content": "<|reserved_special_token_231|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128240": {
+ "content": "<|reserved_special_token_232|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128241": {
+ "content": "<|reserved_special_token_233|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128242": {
+ "content": "<|reserved_special_token_234|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128243": {
+ "content": "<|reserved_special_token_235|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128244": {
+ "content": "<|reserved_special_token_236|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128245": {
+ "content": "<|reserved_special_token_237|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128246": {
+ "content": "<|reserved_special_token_238|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128247": {
+ "content": "<|reserved_special_token_239|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128248": {
+ "content": "<|reserved_special_token_240|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128249": {
+ "content": "<|reserved_special_token_241|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128250": {
+ "content": "<|reserved_special_token_242|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128251": {
+ "content": "<|reserved_special_token_243|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128252": {
+ "content": "<|reserved_special_token_244|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128253": {
+ "content": "<|reserved_special_token_245|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128254": {
+ "content": "<|reserved_special_token_246|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128255": {
+ "content": "<|reserved_special_token_247|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<|begin_of_text|>",
+ "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n",
+ "clean_up_tokenization_spaces": true,
+ "eos_token": "<|eot_id|>",
+ "extra_special_tokens": {},
+ "model_input_names": [
+ "input_ids",
+ "attention_mask"
+ ],
+ "model_max_length": 131072,
+ "pad_token": "<|end_of_text|>",
+ "tokenizer_class": "PreTrainedTokenizer"
+}
diff --git a/checkpoint-636/trainer_state.json b/checkpoint-636/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..f236da090a433f728586b0ca8a91e1d7118badbd
--- /dev/null
+++ b/checkpoint-636/trainer_state.json
@@ -0,0 +1,4485 @@
+{
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 2.9968652037617556,
+ "eval_steps": 500,
+ "global_step": 636,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.004702194357366771,
+ "grad_norm": 3.1606569290161133,
+ "learning_rate": 5.0000000000000004e-08,
+ "loss": 1.0072,
+ "step": 1
+ },
+ {
+ "epoch": 0.009404388714733543,
+ "grad_norm": 3.2058725357055664,
+ "learning_rate": 1.0000000000000001e-07,
+ "loss": 1.0134,
+ "step": 2
+ },
+ {
+ "epoch": 0.014106583072100314,
+ "grad_norm": 2.636291265487671,
+ "learning_rate": 1.5000000000000002e-07,
+ "loss": 0.9635,
+ "step": 3
+ },
+ {
+ "epoch": 0.018808777429467086,
+ "grad_norm": 2.708746910095215,
+ "learning_rate": 2.0000000000000002e-07,
+ "loss": 1.0068,
+ "step": 4
+ },
+ {
+ "epoch": 0.023510971786833857,
+ "grad_norm": 2.8948426246643066,
+ "learning_rate": 2.5000000000000004e-07,
+ "loss": 0.9608,
+ "step": 5
+ },
+ {
+ "epoch": 0.02821316614420063,
+ "grad_norm": 2.8740086555480957,
+ "learning_rate": 3.0000000000000004e-07,
+ "loss": 0.9896,
+ "step": 6
+ },
+ {
+ "epoch": 0.032915360501567396,
+ "grad_norm": 2.8338170051574707,
+ "learning_rate": 3.5000000000000004e-07,
+ "loss": 0.9098,
+ "step": 7
+ },
+ {
+ "epoch": 0.03761755485893417,
+ "grad_norm": 2.7783002853393555,
+ "learning_rate": 4.0000000000000003e-07,
+ "loss": 0.9733,
+ "step": 8
+ },
+ {
+ "epoch": 0.04231974921630094,
+ "grad_norm": 3.043574333190918,
+ "learning_rate": 4.5000000000000003e-07,
+ "loss": 0.9943,
+ "step": 9
+ },
+ {
+ "epoch": 0.047021943573667714,
+ "grad_norm": 3.142383337020874,
+ "learning_rate": 5.000000000000001e-07,
+ "loss": 0.9475,
+ "step": 10
+ },
+ {
+ "epoch": 0.05172413793103448,
+ "grad_norm": 2.9817280769348145,
+ "learning_rate": 5.5e-07,
+ "loss": 0.9701,
+ "step": 11
+ },
+ {
+ "epoch": 0.05642633228840126,
+ "grad_norm": 2.95699405670166,
+ "learning_rate": 6.000000000000001e-07,
+ "loss": 0.9983,
+ "step": 12
+ },
+ {
+ "epoch": 0.061128526645768025,
+ "grad_norm": 2.8782453536987305,
+ "learning_rate": 6.5e-07,
+ "loss": 0.9502,
+ "step": 13
+ },
+ {
+ "epoch": 0.06583072100313479,
+ "grad_norm": 2.6715071201324463,
+ "learning_rate": 7.000000000000001e-07,
+ "loss": 0.9436,
+ "step": 14
+ },
+ {
+ "epoch": 0.07053291536050156,
+ "grad_norm": 3.869649648666382,
+ "learning_rate": 7.5e-07,
+ "loss": 0.9692,
+ "step": 15
+ },
+ {
+ "epoch": 0.07523510971786834,
+ "grad_norm": 3.060220956802368,
+ "learning_rate": 8.000000000000001e-07,
+ "loss": 0.9258,
+ "step": 16
+ },
+ {
+ "epoch": 0.07993730407523511,
+ "grad_norm": 2.8922741413116455,
+ "learning_rate": 8.500000000000001e-07,
+ "loss": 0.9719,
+ "step": 17
+ },
+ {
+ "epoch": 0.08463949843260188,
+ "grad_norm": 2.7857820987701416,
+ "learning_rate": 9.000000000000001e-07,
+ "loss": 0.9072,
+ "step": 18
+ },
+ {
+ "epoch": 0.08934169278996865,
+ "grad_norm": 2.9753293991088867,
+ "learning_rate": 9.500000000000001e-07,
+ "loss": 0.9032,
+ "step": 19
+ },
+ {
+ "epoch": 0.09404388714733543,
+ "grad_norm": 2.7989683151245117,
+ "learning_rate": 1.0000000000000002e-06,
+ "loss": 0.8887,
+ "step": 20
+ },
+ {
+ "epoch": 0.0987460815047022,
+ "grad_norm": 2.3953049182891846,
+ "learning_rate": 1.0500000000000001e-06,
+ "loss": 0.8968,
+ "step": 21
+ },
+ {
+ "epoch": 0.10344827586206896,
+ "grad_norm": 2.643731117248535,
+ "learning_rate": 1.1e-06,
+ "loss": 0.8501,
+ "step": 22
+ },
+ {
+ "epoch": 0.10815047021943573,
+ "grad_norm": 2.3679006099700928,
+ "learning_rate": 1.1500000000000002e-06,
+ "loss": 0.8476,
+ "step": 23
+ },
+ {
+ "epoch": 0.11285266457680251,
+ "grad_norm": 2.5935540199279785,
+ "learning_rate": 1.2000000000000002e-06,
+ "loss": 0.8095,
+ "step": 24
+ },
+ {
+ "epoch": 0.11755485893416928,
+ "grad_norm": 2.510300636291504,
+ "learning_rate": 1.25e-06,
+ "loss": 0.8099,
+ "step": 25
+ },
+ {
+ "epoch": 0.12225705329153605,
+ "grad_norm": 2.372344970703125,
+ "learning_rate": 1.3e-06,
+ "loss": 0.7869,
+ "step": 26
+ },
+ {
+ "epoch": 0.12695924764890282,
+ "grad_norm": 2.303426504135132,
+ "learning_rate": 1.3500000000000002e-06,
+ "loss": 0.7758,
+ "step": 27
+ },
+ {
+ "epoch": 0.13166144200626959,
+ "grad_norm": 1.9017939567565918,
+ "learning_rate": 1.4000000000000001e-06,
+ "loss": 0.7498,
+ "step": 28
+ },
+ {
+ "epoch": 0.13636363636363635,
+ "grad_norm": 1.8810580968856812,
+ "learning_rate": 1.45e-06,
+ "loss": 0.7878,
+ "step": 29
+ },
+ {
+ "epoch": 0.14106583072100312,
+ "grad_norm": 1.7797424793243408,
+ "learning_rate": 1.5e-06,
+ "loss": 0.7747,
+ "step": 30
+ },
+ {
+ "epoch": 0.14576802507836992,
+ "grad_norm": 1.5053879022598267,
+ "learning_rate": 1.5500000000000002e-06,
+ "loss": 0.7735,
+ "step": 31
+ },
+ {
+ "epoch": 0.15047021943573669,
+ "grad_norm": 1.4909234046936035,
+ "learning_rate": 1.6000000000000001e-06,
+ "loss": 0.7654,
+ "step": 32
+ },
+ {
+ "epoch": 0.15517241379310345,
+ "grad_norm": 1.36083984375,
+ "learning_rate": 1.6500000000000003e-06,
+ "loss": 0.6895,
+ "step": 33
+ },
+ {
+ "epoch": 0.15987460815047022,
+ "grad_norm": 1.536014199256897,
+ "learning_rate": 1.7000000000000002e-06,
+ "loss": 0.675,
+ "step": 34
+ },
+ {
+ "epoch": 0.164576802507837,
+ "grad_norm": 1.3426779508590698,
+ "learning_rate": 1.75e-06,
+ "loss": 0.7652,
+ "step": 35
+ },
+ {
+ "epoch": 0.16927899686520376,
+ "grad_norm": 1.4900612831115723,
+ "learning_rate": 1.8000000000000001e-06,
+ "loss": 0.6863,
+ "step": 36
+ },
+ {
+ "epoch": 0.17398119122257052,
+ "grad_norm": 1.181241750717163,
+ "learning_rate": 1.85e-06,
+ "loss": 0.7136,
+ "step": 37
+ },
+ {
+ "epoch": 0.1786833855799373,
+ "grad_norm": 1.461419701576233,
+ "learning_rate": 1.9000000000000002e-06,
+ "loss": 0.7606,
+ "step": 38
+ },
+ {
+ "epoch": 0.1833855799373041,
+ "grad_norm": 1.04817795753479,
+ "learning_rate": 1.9500000000000004e-06,
+ "loss": 0.6829,
+ "step": 39
+ },
+ {
+ "epoch": 0.18808777429467086,
+ "grad_norm": 1.0499993562698364,
+ "learning_rate": 2.0000000000000003e-06,
+ "loss": 0.7144,
+ "step": 40
+ },
+ {
+ "epoch": 0.19278996865203762,
+ "grad_norm": 0.9935064315795898,
+ "learning_rate": 2.05e-06,
+ "loss": 0.6736,
+ "step": 41
+ },
+ {
+ "epoch": 0.1974921630094044,
+ "grad_norm": 0.9919099807739258,
+ "learning_rate": 2.1000000000000002e-06,
+ "loss": 0.7151,
+ "step": 42
+ },
+ {
+ "epoch": 0.20219435736677116,
+ "grad_norm": 0.919556200504303,
+ "learning_rate": 2.15e-06,
+ "loss": 0.6847,
+ "step": 43
+ },
+ {
+ "epoch": 0.20689655172413793,
+ "grad_norm": 1.4762015342712402,
+ "learning_rate": 2.2e-06,
+ "loss": 0.6694,
+ "step": 44
+ },
+ {
+ "epoch": 0.2115987460815047,
+ "grad_norm": 0.9243163466453552,
+ "learning_rate": 2.25e-06,
+ "loss": 0.6489,
+ "step": 45
+ },
+ {
+ "epoch": 0.21630094043887146,
+ "grad_norm": 0.7614469528198242,
+ "learning_rate": 2.3000000000000004e-06,
+ "loss": 0.6568,
+ "step": 46
+ },
+ {
+ "epoch": 0.22100313479623823,
+ "grad_norm": 0.7543922662734985,
+ "learning_rate": 2.35e-06,
+ "loss": 0.6359,
+ "step": 47
+ },
+ {
+ "epoch": 0.22570532915360503,
+ "grad_norm": 0.7558912038803101,
+ "learning_rate": 2.4000000000000003e-06,
+ "loss": 0.6231,
+ "step": 48
+ },
+ {
+ "epoch": 0.2304075235109718,
+ "grad_norm": 0.7822129130363464,
+ "learning_rate": 2.4500000000000003e-06,
+ "loss": 0.6691,
+ "step": 49
+ },
+ {
+ "epoch": 0.23510971786833856,
+ "grad_norm": 0.8646999597549438,
+ "learning_rate": 2.5e-06,
+ "loss": 0.682,
+ "step": 50
+ },
+ {
+ "epoch": 0.23981191222570533,
+ "grad_norm": 0.8824774622917175,
+ "learning_rate": 2.55e-06,
+ "loss": 0.6805,
+ "step": 51
+ },
+ {
+ "epoch": 0.2445141065830721,
+ "grad_norm": 0.7697399258613586,
+ "learning_rate": 2.6e-06,
+ "loss": 0.6368,
+ "step": 52
+ },
+ {
+ "epoch": 0.24921630094043887,
+ "grad_norm": 0.6522512435913086,
+ "learning_rate": 2.6500000000000005e-06,
+ "loss": 0.6367,
+ "step": 53
+ },
+ {
+ "epoch": 0.25391849529780564,
+ "grad_norm": 0.6172305941581726,
+ "learning_rate": 2.7000000000000004e-06,
+ "loss": 0.6291,
+ "step": 54
+ },
+ {
+ "epoch": 0.25862068965517243,
+ "grad_norm": 0.7860460877418518,
+ "learning_rate": 2.7500000000000004e-06,
+ "loss": 0.6736,
+ "step": 55
+ },
+ {
+ "epoch": 0.26332288401253917,
+ "grad_norm": 0.6474862694740295,
+ "learning_rate": 2.8000000000000003e-06,
+ "loss": 0.6365,
+ "step": 56
+ },
+ {
+ "epoch": 0.26802507836990597,
+ "grad_norm": 0.6867114901542664,
+ "learning_rate": 2.85e-06,
+ "loss": 0.6397,
+ "step": 57
+ },
+ {
+ "epoch": 0.2727272727272727,
+ "grad_norm": 0.7056852579116821,
+ "learning_rate": 2.9e-06,
+ "loss": 0.6138,
+ "step": 58
+ },
+ {
+ "epoch": 0.2774294670846395,
+ "grad_norm": 0.6615664958953857,
+ "learning_rate": 2.95e-06,
+ "loss": 0.6482,
+ "step": 59
+ },
+ {
+ "epoch": 0.28213166144200624,
+ "grad_norm": 0.6649022102355957,
+ "learning_rate": 3e-06,
+ "loss": 0.6745,
+ "step": 60
+ },
+ {
+ "epoch": 0.28683385579937304,
+ "grad_norm": 0.850848913192749,
+ "learning_rate": 3.05e-06,
+ "loss": 0.5956,
+ "step": 61
+ },
+ {
+ "epoch": 0.29153605015673983,
+ "grad_norm": 0.5983562469482422,
+ "learning_rate": 3.1000000000000004e-06,
+ "loss": 0.5894,
+ "step": 62
+ },
+ {
+ "epoch": 0.2962382445141066,
+ "grad_norm": 0.6286782622337341,
+ "learning_rate": 3.1500000000000003e-06,
+ "loss": 0.6329,
+ "step": 63
+ },
+ {
+ "epoch": 0.30094043887147337,
+ "grad_norm": 0.5919945240020752,
+ "learning_rate": 3.2000000000000003e-06,
+ "loss": 0.6402,
+ "step": 64
+ },
+ {
+ "epoch": 0.3056426332288401,
+ "grad_norm": 0.5632765889167786,
+ "learning_rate": 3.2500000000000002e-06,
+ "loss": 0.5862,
+ "step": 65
+ },
+ {
+ "epoch": 0.3103448275862069,
+ "grad_norm": 0.7692590951919556,
+ "learning_rate": 3.3000000000000006e-06,
+ "loss": 0.6031,
+ "step": 66
+ },
+ {
+ "epoch": 0.31504702194357365,
+ "grad_norm": 0.7313893437385559,
+ "learning_rate": 3.3500000000000005e-06,
+ "loss": 0.6312,
+ "step": 67
+ },
+ {
+ "epoch": 0.31974921630094044,
+ "grad_norm": 0.6097120642662048,
+ "learning_rate": 3.4000000000000005e-06,
+ "loss": 0.5986,
+ "step": 68
+ },
+ {
+ "epoch": 0.32445141065830724,
+ "grad_norm": 0.5853808522224426,
+ "learning_rate": 3.45e-06,
+ "loss": 0.5847,
+ "step": 69
+ },
+ {
+ "epoch": 0.329153605015674,
+ "grad_norm": 0.6093555092811584,
+ "learning_rate": 3.5e-06,
+ "loss": 0.6552,
+ "step": 70
+ },
+ {
+ "epoch": 0.3338557993730408,
+ "grad_norm": 0.6106334328651428,
+ "learning_rate": 3.5500000000000003e-06,
+ "loss": 0.6196,
+ "step": 71
+ },
+ {
+ "epoch": 0.3385579937304075,
+ "grad_norm": 0.9254828691482544,
+ "learning_rate": 3.6000000000000003e-06,
+ "loss": 0.6005,
+ "step": 72
+ },
+ {
+ "epoch": 0.3432601880877743,
+ "grad_norm": 0.5471694469451904,
+ "learning_rate": 3.65e-06,
+ "loss": 0.5907,
+ "step": 73
+ },
+ {
+ "epoch": 0.34796238244514105,
+ "grad_norm": 0.6204228401184082,
+ "learning_rate": 3.7e-06,
+ "loss": 0.6079,
+ "step": 74
+ },
+ {
+ "epoch": 0.35266457680250785,
+ "grad_norm": 0.52458256483078,
+ "learning_rate": 3.7500000000000005e-06,
+ "loss": 0.6001,
+ "step": 75
+ },
+ {
+ "epoch": 0.3573667711598746,
+ "grad_norm": 0.5356763601303101,
+ "learning_rate": 3.8000000000000005e-06,
+ "loss": 0.5987,
+ "step": 76
+ },
+ {
+ "epoch": 0.3620689655172414,
+ "grad_norm": 0.5408467054367065,
+ "learning_rate": 3.85e-06,
+ "loss": 0.6104,
+ "step": 77
+ },
+ {
+ "epoch": 0.3667711598746082,
+ "grad_norm": 0.5075871348381042,
+ "learning_rate": 3.900000000000001e-06,
+ "loss": 0.5569,
+ "step": 78
+ },
+ {
+ "epoch": 0.3714733542319749,
+ "grad_norm": 0.8474109768867493,
+ "learning_rate": 3.95e-06,
+ "loss": 0.6195,
+ "step": 79
+ },
+ {
+ "epoch": 0.3761755485893417,
+ "grad_norm": 0.4750897288322449,
+ "learning_rate": 4.000000000000001e-06,
+ "loss": 0.5399,
+ "step": 80
+ },
+ {
+ "epoch": 0.38087774294670845,
+ "grad_norm": 0.5082002878189087,
+ "learning_rate": 4.05e-06,
+ "loss": 0.5997,
+ "step": 81
+ },
+ {
+ "epoch": 0.38557993730407525,
+ "grad_norm": 0.5343796014785767,
+ "learning_rate": 4.1e-06,
+ "loss": 0.5704,
+ "step": 82
+ },
+ {
+ "epoch": 0.390282131661442,
+ "grad_norm": 0.520311713218689,
+ "learning_rate": 4.15e-06,
+ "loss": 0.5818,
+ "step": 83
+ },
+ {
+ "epoch": 0.3949843260188088,
+ "grad_norm": 0.5292978286743164,
+ "learning_rate": 4.2000000000000004e-06,
+ "loss": 0.5852,
+ "step": 84
+ },
+ {
+ "epoch": 0.3996865203761755,
+ "grad_norm": 0.539886474609375,
+ "learning_rate": 4.25e-06,
+ "loss": 0.6057,
+ "step": 85
+ },
+ {
+ "epoch": 0.4043887147335423,
+ "grad_norm": 0.6468827128410339,
+ "learning_rate": 4.3e-06,
+ "loss": 0.6122,
+ "step": 86
+ },
+ {
+ "epoch": 0.4090909090909091,
+ "grad_norm": 0.5537365078926086,
+ "learning_rate": 4.350000000000001e-06,
+ "loss": 0.5652,
+ "step": 87
+ },
+ {
+ "epoch": 0.41379310344827586,
+ "grad_norm": 0.6226018667221069,
+ "learning_rate": 4.4e-06,
+ "loss": 0.5884,
+ "step": 88
+ },
+ {
+ "epoch": 0.41849529780564265,
+ "grad_norm": 0.5016945004463196,
+ "learning_rate": 4.450000000000001e-06,
+ "loss": 0.5877,
+ "step": 89
+ },
+ {
+ "epoch": 0.4231974921630094,
+ "grad_norm": 0.5059167146682739,
+ "learning_rate": 4.5e-06,
+ "loss": 0.5676,
+ "step": 90
+ },
+ {
+ "epoch": 0.4278996865203762,
+ "grad_norm": 0.47521743178367615,
+ "learning_rate": 4.5500000000000005e-06,
+ "loss": 0.5929,
+ "step": 91
+ },
+ {
+ "epoch": 0.43260188087774293,
+ "grad_norm": 0.531306266784668,
+ "learning_rate": 4.600000000000001e-06,
+ "loss": 0.5983,
+ "step": 92
+ },
+ {
+ "epoch": 0.4373040752351097,
+ "grad_norm": 0.4965567886829376,
+ "learning_rate": 4.65e-06,
+ "loss": 0.5279,
+ "step": 93
+ },
+ {
+ "epoch": 0.44200626959247646,
+ "grad_norm": 0.5125988125801086,
+ "learning_rate": 4.7e-06,
+ "loss": 0.5436,
+ "step": 94
+ },
+ {
+ "epoch": 0.44670846394984326,
+ "grad_norm": 0.557763934135437,
+ "learning_rate": 4.75e-06,
+ "loss": 0.5496,
+ "step": 95
+ },
+ {
+ "epoch": 0.45141065830721006,
+ "grad_norm": 0.6993274092674255,
+ "learning_rate": 4.800000000000001e-06,
+ "loss": 0.5498,
+ "step": 96
+ },
+ {
+ "epoch": 0.4561128526645768,
+ "grad_norm": 0.5485453009605408,
+ "learning_rate": 4.85e-06,
+ "loss": 0.5552,
+ "step": 97
+ },
+ {
+ "epoch": 0.4608150470219436,
+ "grad_norm": 1.9821522235870361,
+ "learning_rate": 4.9000000000000005e-06,
+ "loss": 0.569,
+ "step": 98
+ },
+ {
+ "epoch": 0.46551724137931033,
+ "grad_norm": 0.6074144840240479,
+ "learning_rate": 4.95e-06,
+ "loss": 0.5546,
+ "step": 99
+ },
+ {
+ "epoch": 0.4702194357366771,
+ "grad_norm": 0.5404040813446045,
+ "learning_rate": 5e-06,
+ "loss": 0.5775,
+ "step": 100
+ },
+ {
+ "epoch": 0.47492163009404387,
+ "grad_norm": 0.500438928604126,
+ "learning_rate": 4.9999910183883085e-06,
+ "loss": 0.5569,
+ "step": 101
+ },
+ {
+ "epoch": 0.47962382445141066,
+ "grad_norm": 0.5036981701850891,
+ "learning_rate": 4.999964073617768e-06,
+ "loss": 0.5663,
+ "step": 102
+ },
+ {
+ "epoch": 0.4843260188087774,
+ "grad_norm": 0.4537642300128937,
+ "learning_rate": 4.999919165881985e-06,
+ "loss": 0.5527,
+ "step": 103
+ },
+ {
+ "epoch": 0.4890282131661442,
+ "grad_norm": 0.49653521180152893,
+ "learning_rate": 4.999856295503635e-06,
+ "loss": 0.563,
+ "step": 104
+ },
+ {
+ "epoch": 0.493730407523511,
+ "grad_norm": 0.46847566962242126,
+ "learning_rate": 4.9997754629344596e-06,
+ "loss": 0.5425,
+ "step": 105
+ },
+ {
+ "epoch": 0.49843260188087773,
+ "grad_norm": 0.5192411541938782,
+ "learning_rate": 4.999676668755263e-06,
+ "loss": 0.5315,
+ "step": 106
+ },
+ {
+ "epoch": 0.5031347962382445,
+ "grad_norm": 0.5170287489891052,
+ "learning_rate": 4.999559913675912e-06,
+ "loss": 0.5627,
+ "step": 107
+ },
+ {
+ "epoch": 0.5078369905956113,
+ "grad_norm": 0.47297438979148865,
+ "learning_rate": 4.999425198535325e-06,
+ "loss": 0.5432,
+ "step": 108
+ },
+ {
+ "epoch": 0.512539184952978,
+ "grad_norm": 0.4873776137828827,
+ "learning_rate": 4.999272524301469e-06,
+ "loss": 0.5473,
+ "step": 109
+ },
+ {
+ "epoch": 0.5172413793103449,
+ "grad_norm": 0.5432935357093811,
+ "learning_rate": 4.9991018920713505e-06,
+ "loss": 0.5642,
+ "step": 110
+ },
+ {
+ "epoch": 0.5219435736677116,
+ "grad_norm": 0.4850105345249176,
+ "learning_rate": 4.9989133030710154e-06,
+ "loss": 0.548,
+ "step": 111
+ },
+ {
+ "epoch": 0.5266457680250783,
+ "grad_norm": 0.9399585723876953,
+ "learning_rate": 4.9987067586555275e-06,
+ "loss": 0.5471,
+ "step": 112
+ },
+ {
+ "epoch": 0.5313479623824452,
+ "grad_norm": 0.5167811512947083,
+ "learning_rate": 4.998482260308969e-06,
+ "loss": 0.5648,
+ "step": 113
+ },
+ {
+ "epoch": 0.5360501567398119,
+ "grad_norm": 0.5069029927253723,
+ "learning_rate": 4.998239809644427e-06,
+ "loss": 0.5568,
+ "step": 114
+ },
+ {
+ "epoch": 0.5407523510971787,
+ "grad_norm": 0.8738563656806946,
+ "learning_rate": 4.9979794084039755e-06,
+ "loss": 0.5719,
+ "step": 115
+ },
+ {
+ "epoch": 0.5454545454545454,
+ "grad_norm": 0.5216553807258606,
+ "learning_rate": 4.997701058458677e-06,
+ "loss": 0.5309,
+ "step": 116
+ },
+ {
+ "epoch": 0.5501567398119123,
+ "grad_norm": 0.9678344130516052,
+ "learning_rate": 4.997404761808554e-06,
+ "loss": 0.5645,
+ "step": 117
+ },
+ {
+ "epoch": 0.554858934169279,
+ "grad_norm": 0.496598482131958,
+ "learning_rate": 4.9970905205825845e-06,
+ "loss": 0.5711,
+ "step": 118
+ },
+ {
+ "epoch": 0.5595611285266457,
+ "grad_norm": 0.4745199680328369,
+ "learning_rate": 4.996758337038683e-06,
+ "loss": 0.5613,
+ "step": 119
+ },
+ {
+ "epoch": 0.5642633228840125,
+ "grad_norm": 0.5595977902412415,
+ "learning_rate": 4.996408213563684e-06,
+ "loss": 0.5559,
+ "step": 120
+ },
+ {
+ "epoch": 0.5689655172413793,
+ "grad_norm": 0.4743712544441223,
+ "learning_rate": 4.996040152673326e-06,
+ "loss": 0.5228,
+ "step": 121
+ },
+ {
+ "epoch": 0.5736677115987461,
+ "grad_norm": 0.5418100953102112,
+ "learning_rate": 4.995654157012233e-06,
+ "loss": 0.536,
+ "step": 122
+ },
+ {
+ "epoch": 0.5783699059561128,
+ "grad_norm": 0.521977424621582,
+ "learning_rate": 4.995250229353895e-06,
+ "loss": 0.5305,
+ "step": 123
+ },
+ {
+ "epoch": 0.5830721003134797,
+ "grad_norm": 0.5062761902809143,
+ "learning_rate": 4.99482837260065e-06,
+ "loss": 0.5417,
+ "step": 124
+ },
+ {
+ "epoch": 0.5877742946708464,
+ "grad_norm": 0.5895913243293762,
+ "learning_rate": 4.99438858978366e-06,
+ "loss": 0.573,
+ "step": 125
+ },
+ {
+ "epoch": 0.5924764890282131,
+ "grad_norm": 0.5442466139793396,
+ "learning_rate": 4.993930884062892e-06,
+ "loss": 0.5563,
+ "step": 126
+ },
+ {
+ "epoch": 0.5971786833855799,
+ "grad_norm": 0.5130571722984314,
+ "learning_rate": 4.993455258727094e-06,
+ "loss": 0.5549,
+ "step": 127
+ },
+ {
+ "epoch": 0.6018808777429467,
+ "grad_norm": 0.5579081773757935,
+ "learning_rate": 4.992961717193773e-06,
+ "loss": 0.5554,
+ "step": 128
+ },
+ {
+ "epoch": 0.6065830721003135,
+ "grad_norm": 0.6375890374183655,
+ "learning_rate": 4.9924502630091655e-06,
+ "loss": 0.5626,
+ "step": 129
+ },
+ {
+ "epoch": 0.6112852664576802,
+ "grad_norm": 0.5129190683364868,
+ "learning_rate": 4.99192089984822e-06,
+ "loss": 0.5493,
+ "step": 130
+ },
+ {
+ "epoch": 0.6159874608150471,
+ "grad_norm": 0.5293419361114502,
+ "learning_rate": 4.9913736315145614e-06,
+ "loss": 0.5565,
+ "step": 131
+ },
+ {
+ "epoch": 0.6206896551724138,
+ "grad_norm": 0.6502572298049927,
+ "learning_rate": 4.990808461940474e-06,
+ "loss": 0.5358,
+ "step": 132
+ },
+ {
+ "epoch": 0.6253918495297806,
+ "grad_norm": 0.5450296998023987,
+ "learning_rate": 4.990225395186862e-06,
+ "loss": 0.5421,
+ "step": 133
+ },
+ {
+ "epoch": 0.6300940438871473,
+ "grad_norm": 0.45506399869918823,
+ "learning_rate": 4.9896244354432314e-06,
+ "loss": 0.5396,
+ "step": 134
+ },
+ {
+ "epoch": 0.6347962382445141,
+ "grad_norm": 0.5095545649528503,
+ "learning_rate": 4.98900558702765e-06,
+ "loss": 0.5486,
+ "step": 135
+ },
+ {
+ "epoch": 0.6394984326018809,
+ "grad_norm": 0.4836446940898895,
+ "learning_rate": 4.9883688543867225e-06,
+ "loss": 0.5596,
+ "step": 136
+ },
+ {
+ "epoch": 0.6442006269592476,
+ "grad_norm": 0.5253512859344482,
+ "learning_rate": 4.987714242095558e-06,
+ "loss": 0.5308,
+ "step": 137
+ },
+ {
+ "epoch": 0.6489028213166145,
+ "grad_norm": 0.8280164003372192,
+ "learning_rate": 4.9870417548577355e-06,
+ "loss": 0.5349,
+ "step": 138
+ },
+ {
+ "epoch": 0.6536050156739812,
+ "grad_norm": 0.4729730188846588,
+ "learning_rate": 4.9863513975052696e-06,
+ "loss": 0.5416,
+ "step": 139
+ },
+ {
+ "epoch": 0.658307210031348,
+ "grad_norm": 0.5932718515396118,
+ "learning_rate": 4.985643174998578e-06,
+ "loss": 0.5638,
+ "step": 140
+ },
+ {
+ "epoch": 0.6630094043887147,
+ "grad_norm": 0.5187026262283325,
+ "learning_rate": 4.984917092426445e-06,
+ "loss": 0.5507,
+ "step": 141
+ },
+ {
+ "epoch": 0.6677115987460815,
+ "grad_norm": 0.5024245977401733,
+ "learning_rate": 4.984173155005982e-06,
+ "loss": 0.5406,
+ "step": 142
+ },
+ {
+ "epoch": 0.6724137931034483,
+ "grad_norm": 0.4735509157180786,
+ "learning_rate": 4.983411368082597e-06,
+ "loss": 0.5431,
+ "step": 143
+ },
+ {
+ "epoch": 0.677115987460815,
+ "grad_norm": 0.5040024518966675,
+ "learning_rate": 4.982631737129948e-06,
+ "loss": 0.5291,
+ "step": 144
+ },
+ {
+ "epoch": 0.6818181818181818,
+ "grad_norm": 0.47764894366264343,
+ "learning_rate": 4.98183426774991e-06,
+ "loss": 0.5677,
+ "step": 145
+ },
+ {
+ "epoch": 0.6865203761755486,
+ "grad_norm": 0.5211489796638489,
+ "learning_rate": 4.981018965672529e-06,
+ "loss": 0.566,
+ "step": 146
+ },
+ {
+ "epoch": 0.6912225705329154,
+ "grad_norm": 1.022007942199707,
+ "learning_rate": 4.98018583675599e-06,
+ "loss": 0.5476,
+ "step": 147
+ },
+ {
+ "epoch": 0.6959247648902821,
+ "grad_norm": 0.5263912677764893,
+ "learning_rate": 4.979334886986562e-06,
+ "loss": 0.5473,
+ "step": 148
+ },
+ {
+ "epoch": 0.700626959247649,
+ "grad_norm": 0.5014091730117798,
+ "learning_rate": 4.978466122478567e-06,
+ "loss": 0.5642,
+ "step": 149
+ },
+ {
+ "epoch": 0.7053291536050157,
+ "grad_norm": 0.5003350973129272,
+ "learning_rate": 4.97757954947433e-06,
+ "loss": 0.5311,
+ "step": 150
+ },
+ {
+ "epoch": 0.7100313479623824,
+ "grad_norm": 0.5010690093040466,
+ "learning_rate": 4.976675174344132e-06,
+ "loss": 0.5469,
+ "step": 151
+ },
+ {
+ "epoch": 0.7147335423197492,
+ "grad_norm": 0.45779237151145935,
+ "learning_rate": 4.975753003586172e-06,
+ "loss": 0.5273,
+ "step": 152
+ },
+ {
+ "epoch": 0.719435736677116,
+ "grad_norm": 0.6231539845466614,
+ "learning_rate": 4.974813043826513e-06,
+ "loss": 0.5182,
+ "step": 153
+ },
+ {
+ "epoch": 0.7241379310344828,
+ "grad_norm": 0.5361394286155701,
+ "learning_rate": 4.973855301819039e-06,
+ "loss": 0.5372,
+ "step": 154
+ },
+ {
+ "epoch": 0.7288401253918495,
+ "grad_norm": 0.5193538665771484,
+ "learning_rate": 4.972879784445402e-06,
+ "loss": 0.5201,
+ "step": 155
+ },
+ {
+ "epoch": 0.7335423197492164,
+ "grad_norm": 0.47956809401512146,
+ "learning_rate": 4.971886498714978e-06,
+ "loss": 0.5402,
+ "step": 156
+ },
+ {
+ "epoch": 0.7382445141065831,
+ "grad_norm": 0.5303016901016235,
+ "learning_rate": 4.97087545176481e-06,
+ "loss": 0.5174,
+ "step": 157
+ },
+ {
+ "epoch": 0.7429467084639498,
+ "grad_norm": 0.5002286434173584,
+ "learning_rate": 4.9698466508595655e-06,
+ "loss": 0.5453,
+ "step": 158
+ },
+ {
+ "epoch": 0.7476489028213166,
+ "grad_norm": 0.6070297360420227,
+ "learning_rate": 4.9688001033914756e-06,
+ "loss": 0.5327,
+ "step": 159
+ },
+ {
+ "epoch": 0.7523510971786834,
+ "grad_norm": 0.5436793565750122,
+ "learning_rate": 4.967735816880286e-06,
+ "loss": 0.544,
+ "step": 160
+ },
+ {
+ "epoch": 0.7570532915360502,
+ "grad_norm": 0.538012683391571,
+ "learning_rate": 4.966653798973205e-06,
+ "loss": 0.5233,
+ "step": 161
+ },
+ {
+ "epoch": 0.7617554858934169,
+ "grad_norm": 0.4916169345378876,
+ "learning_rate": 4.965554057444842e-06,
+ "loss": 0.5168,
+ "step": 162
+ },
+ {
+ "epoch": 0.7664576802507836,
+ "grad_norm": 0.48281437158584595,
+ "learning_rate": 4.964436600197161e-06,
+ "loss": 0.5393,
+ "step": 163
+ },
+ {
+ "epoch": 0.7711598746081505,
+ "grad_norm": 0.5184990167617798,
+ "learning_rate": 4.963301435259413e-06,
+ "loss": 0.5085,
+ "step": 164
+ },
+ {
+ "epoch": 0.7758620689655172,
+ "grad_norm": 0.4706438183784485,
+ "learning_rate": 4.962148570788088e-06,
+ "loss": 0.5299,
+ "step": 165
+ },
+ {
+ "epoch": 0.780564263322884,
+ "grad_norm": 0.6550764441490173,
+ "learning_rate": 4.96097801506685e-06,
+ "loss": 0.5192,
+ "step": 166
+ },
+ {
+ "epoch": 0.7852664576802508,
+ "grad_norm": 0.5386581420898438,
+ "learning_rate": 4.959789776506482e-06,
+ "loss": 0.5258,
+ "step": 167
+ },
+ {
+ "epoch": 0.7899686520376176,
+ "grad_norm": 0.5060779452323914,
+ "learning_rate": 4.958583863644821e-06,
+ "loss": 0.5512,
+ "step": 168
+ },
+ {
+ "epoch": 0.7946708463949843,
+ "grad_norm": 0.47050032019615173,
+ "learning_rate": 4.9573602851466985e-06,
+ "loss": 0.5176,
+ "step": 169
+ },
+ {
+ "epoch": 0.799373040752351,
+ "grad_norm": 7.3139567375183105,
+ "learning_rate": 4.9561190498038815e-06,
+ "loss": 0.5381,
+ "step": 170
+ },
+ {
+ "epoch": 0.8040752351097179,
+ "grad_norm": 0.620528519153595,
+ "learning_rate": 4.954860166535005e-06,
+ "loss": 0.5299,
+ "step": 171
+ },
+ {
+ "epoch": 0.8087774294670846,
+ "grad_norm": 0.45067766308784485,
+ "learning_rate": 4.95358364438551e-06,
+ "loss": 0.5328,
+ "step": 172
+ },
+ {
+ "epoch": 0.8134796238244514,
+ "grad_norm": 0.6771508455276489,
+ "learning_rate": 4.952289492527576e-06,
+ "loss": 0.5601,
+ "step": 173
+ },
+ {
+ "epoch": 0.8181818181818182,
+ "grad_norm": 0.518925130367279,
+ "learning_rate": 4.9509777202600605e-06,
+ "loss": 0.494,
+ "step": 174
+ },
+ {
+ "epoch": 0.822884012539185,
+ "grad_norm": 0.5191988945007324,
+ "learning_rate": 4.949648337008425e-06,
+ "loss": 0.5425,
+ "step": 175
+ },
+ {
+ "epoch": 0.8275862068965517,
+ "grad_norm": 0.8600963354110718,
+ "learning_rate": 4.948301352324674e-06,
+ "loss": 0.5332,
+ "step": 176
+ },
+ {
+ "epoch": 0.8322884012539185,
+ "grad_norm": 0.5405915379524231,
+ "learning_rate": 4.946936775887281e-06,
+ "loss": 0.5276,
+ "step": 177
+ },
+ {
+ "epoch": 0.8369905956112853,
+ "grad_norm": 0.48730772733688354,
+ "learning_rate": 4.945554617501124e-06,
+ "loss": 0.5217,
+ "step": 178
+ },
+ {
+ "epoch": 0.841692789968652,
+ "grad_norm": 0.5092865824699402,
+ "learning_rate": 4.944154887097411e-06,
+ "loss": 0.5534,
+ "step": 179
+ },
+ {
+ "epoch": 0.8463949843260188,
+ "grad_norm": 0.4994933605194092,
+ "learning_rate": 4.942737594733608e-06,
+ "loss": 0.5242,
+ "step": 180
+ },
+ {
+ "epoch": 0.8510971786833855,
+ "grad_norm": 0.4554043412208557,
+ "learning_rate": 4.941302750593373e-06,
+ "loss": 0.5424,
+ "step": 181
+ },
+ {
+ "epoch": 0.8557993730407524,
+ "grad_norm": 0.4865265488624573,
+ "learning_rate": 4.939850364986475e-06,
+ "loss": 0.482,
+ "step": 182
+ },
+ {
+ "epoch": 0.8605015673981191,
+ "grad_norm": 0.5013875365257263,
+ "learning_rate": 4.938380448348725e-06,
+ "loss": 0.4908,
+ "step": 183
+ },
+ {
+ "epoch": 0.8652037617554859,
+ "grad_norm": 0.4997917115688324,
+ "learning_rate": 4.9368930112419e-06,
+ "loss": 0.5336,
+ "step": 184
+ },
+ {
+ "epoch": 0.8699059561128527,
+ "grad_norm": 0.4783482551574707,
+ "learning_rate": 4.935388064353665e-06,
+ "loss": 0.5338,
+ "step": 185
+ },
+ {
+ "epoch": 0.8746081504702194,
+ "grad_norm": 0.7221089005470276,
+ "learning_rate": 4.9338656184975e-06,
+ "loss": 0.5327,
+ "step": 186
+ },
+ {
+ "epoch": 0.8793103448275862,
+ "grad_norm": 0.48115843534469604,
+ "learning_rate": 4.932325684612618e-06,
+ "loss": 0.5408,
+ "step": 187
+ },
+ {
+ "epoch": 0.8840125391849529,
+ "grad_norm": 0.4940219223499298,
+ "learning_rate": 4.93076827376389e-06,
+ "loss": 0.5455,
+ "step": 188
+ },
+ {
+ "epoch": 0.8887147335423198,
+ "grad_norm": 0.4754747450351715,
+ "learning_rate": 4.9291933971417635e-06,
+ "loss": 0.542,
+ "step": 189
+ },
+ {
+ "epoch": 0.8934169278996865,
+ "grad_norm": 0.548713207244873,
+ "learning_rate": 4.9276010660621835e-06,
+ "loss": 0.5292,
+ "step": 190
+ },
+ {
+ "epoch": 0.8981191222570533,
+ "grad_norm": 0.7292612195014954,
+ "learning_rate": 4.925991291966508e-06,
+ "loss": 0.5073,
+ "step": 191
+ },
+ {
+ "epoch": 0.9028213166144201,
+ "grad_norm": 0.5254770517349243,
+ "learning_rate": 4.92436408642143e-06,
+ "loss": 0.5451,
+ "step": 192
+ },
+ {
+ "epoch": 0.9075235109717869,
+ "grad_norm": 0.47938767075538635,
+ "learning_rate": 4.9227194611188934e-06,
+ "loss": 0.5204,
+ "step": 193
+ },
+ {
+ "epoch": 0.9122257053291536,
+ "grad_norm": 0.6740232706069946,
+ "learning_rate": 4.921057427876007e-06,
+ "loss": 0.4928,
+ "step": 194
+ },
+ {
+ "epoch": 0.9169278996865203,
+ "grad_norm": 0.5455343723297119,
+ "learning_rate": 4.919377998634959e-06,
+ "loss": 0.5468,
+ "step": 195
+ },
+ {
+ "epoch": 0.9216300940438872,
+ "grad_norm": 0.5001958012580872,
+ "learning_rate": 4.917681185462934e-06,
+ "loss": 0.5339,
+ "step": 196
+ },
+ {
+ "epoch": 0.9263322884012539,
+ "grad_norm": 0.5084257125854492,
+ "learning_rate": 4.915967000552028e-06,
+ "loss": 0.5259,
+ "step": 197
+ },
+ {
+ "epoch": 0.9310344827586207,
+ "grad_norm": 0.4807164967060089,
+ "learning_rate": 4.914235456219154e-06,
+ "loss": 0.5204,
+ "step": 198
+ },
+ {
+ "epoch": 0.9357366771159875,
+ "grad_norm": 0.6099370718002319,
+ "learning_rate": 4.912486564905959e-06,
+ "loss": 0.544,
+ "step": 199
+ },
+ {
+ "epoch": 0.9404388714733543,
+ "grad_norm": 0.47461947798728943,
+ "learning_rate": 4.910720339178735e-06,
+ "loss": 0.5295,
+ "step": 200
+ },
+ {
+ "epoch": 0.945141065830721,
+ "grad_norm": 0.500136137008667,
+ "learning_rate": 4.908936791728323e-06,
+ "loss": 0.5321,
+ "step": 201
+ },
+ {
+ "epoch": 0.9498432601880877,
+ "grad_norm": 0.5235631465911865,
+ "learning_rate": 4.907135935370027e-06,
+ "loss": 0.5338,
+ "step": 202
+ },
+ {
+ "epoch": 0.9545454545454546,
+ "grad_norm": 0.9285804629325867,
+ "learning_rate": 4.905317783043523e-06,
+ "loss": 0.5393,
+ "step": 203
+ },
+ {
+ "epoch": 0.9592476489028213,
+ "grad_norm": 0.4834178388118744,
+ "learning_rate": 4.9034823478127605e-06,
+ "loss": 0.5211,
+ "step": 204
+ },
+ {
+ "epoch": 0.9639498432601881,
+ "grad_norm": 0.4830580949783325,
+ "learning_rate": 4.901629642865872e-06,
+ "loss": 0.4986,
+ "step": 205
+ },
+ {
+ "epoch": 0.9686520376175548,
+ "grad_norm": 0.49718615412712097,
+ "learning_rate": 4.89975968151508e-06,
+ "loss": 0.5204,
+ "step": 206
+ },
+ {
+ "epoch": 0.9733542319749217,
+ "grad_norm": 0.5056726336479187,
+ "learning_rate": 4.8978724771965965e-06,
+ "loss": 0.5133,
+ "step": 207
+ },
+ {
+ "epoch": 0.9780564263322884,
+ "grad_norm": 0.7357563376426697,
+ "learning_rate": 4.895968043470532e-06,
+ "loss": 0.5307,
+ "step": 208
+ },
+ {
+ "epoch": 0.9827586206896551,
+ "grad_norm": 0.515610933303833,
+ "learning_rate": 4.894046394020794e-06,
+ "loss": 0.4955,
+ "step": 209
+ },
+ {
+ "epoch": 0.987460815047022,
+ "grad_norm": 0.5124618411064148,
+ "learning_rate": 4.892107542654988e-06,
+ "loss": 0.526,
+ "step": 210
+ },
+ {
+ "epoch": 0.9921630094043887,
+ "grad_norm": 0.5059565901756287,
+ "learning_rate": 4.890151503304325e-06,
+ "loss": 0.5473,
+ "step": 211
+ },
+ {
+ "epoch": 0.9968652037617555,
+ "grad_norm": 0.4806717336177826,
+ "learning_rate": 4.88817829002351e-06,
+ "loss": 0.5212,
+ "step": 212
+ },
+ {
+ "epoch": 1.0047021943573669,
+ "grad_norm": 0.9454345703125,
+ "learning_rate": 4.886187916990653e-06,
+ "loss": 1.0566,
+ "step": 213
+ },
+ {
+ "epoch": 1.0094043887147335,
+ "grad_norm": 0.4871070086956024,
+ "learning_rate": 4.884180398507163e-06,
+ "loss": 0.503,
+ "step": 214
+ },
+ {
+ "epoch": 1.0141065830721003,
+ "grad_norm": 0.45102012157440186,
+ "learning_rate": 4.882155748997636e-06,
+ "loss": 0.4954,
+ "step": 215
+ },
+ {
+ "epoch": 1.0188087774294672,
+ "grad_norm": 0.49910685420036316,
+ "learning_rate": 4.8801139830097685e-06,
+ "loss": 0.5019,
+ "step": 216
+ },
+ {
+ "epoch": 1.0235109717868338,
+ "grad_norm": 0.5155763030052185,
+ "learning_rate": 4.878055115214238e-06,
+ "loss": 0.5102,
+ "step": 217
+ },
+ {
+ "epoch": 1.0282131661442007,
+ "grad_norm": 0.4567059874534607,
+ "learning_rate": 4.875979160404607e-06,
+ "loss": 0.5069,
+ "step": 218
+ },
+ {
+ "epoch": 1.0329153605015673,
+ "grad_norm": 0.4782896935939789,
+ "learning_rate": 4.873886133497209e-06,
+ "loss": 0.5182,
+ "step": 219
+ },
+ {
+ "epoch": 1.0376175548589341,
+ "grad_norm": 0.44995731115341187,
+ "learning_rate": 4.87177604953105e-06,
+ "loss": 0.513,
+ "step": 220
+ },
+ {
+ "epoch": 1.042319749216301,
+ "grad_norm": 0.470059871673584,
+ "learning_rate": 4.869648923667694e-06,
+ "loss": 0.468,
+ "step": 221
+ },
+ {
+ "epoch": 1.0470219435736676,
+ "grad_norm": 0.5356128215789795,
+ "learning_rate": 4.867504771191154e-06,
+ "loss": 0.4942,
+ "step": 222
+ },
+ {
+ "epoch": 1.0517241379310345,
+ "grad_norm": 0.5137870907783508,
+ "learning_rate": 4.865343607507788e-06,
+ "loss": 0.5022,
+ "step": 223
+ },
+ {
+ "epoch": 1.0564263322884013,
+ "grad_norm": 0.47419992089271545,
+ "learning_rate": 4.86316544814618e-06,
+ "loss": 0.5158,
+ "step": 224
+ },
+ {
+ "epoch": 1.061128526645768,
+ "grad_norm": 0.49087393283843994,
+ "learning_rate": 4.860970308757038e-06,
+ "loss": 0.4605,
+ "step": 225
+ },
+ {
+ "epoch": 1.0658307210031348,
+ "grad_norm": 0.4988348186016083,
+ "learning_rate": 4.858758205113072e-06,
+ "loss": 0.4912,
+ "step": 226
+ },
+ {
+ "epoch": 1.0705329153605017,
+ "grad_norm": 0.44543248414993286,
+ "learning_rate": 4.856529153108888e-06,
+ "loss": 0.524,
+ "step": 227
+ },
+ {
+ "epoch": 1.0752351097178683,
+ "grad_norm": 0.5953351259231567,
+ "learning_rate": 4.854283168760868e-06,
+ "loss": 0.5001,
+ "step": 228
+ },
+ {
+ "epoch": 1.0799373040752351,
+ "grad_norm": 0.5012004375457764,
+ "learning_rate": 4.85202026820706e-06,
+ "loss": 0.4968,
+ "step": 229
+ },
+ {
+ "epoch": 1.084639498432602,
+ "grad_norm": 0.5023937821388245,
+ "learning_rate": 4.84974046770706e-06,
+ "loss": 0.5345,
+ "step": 230
+ },
+ {
+ "epoch": 1.0893416927899686,
+ "grad_norm": 0.4705684185028076,
+ "learning_rate": 4.847443783641893e-06,
+ "loss": 0.4459,
+ "step": 231
+ },
+ {
+ "epoch": 1.0940438871473355,
+ "grad_norm": 0.5082476735115051,
+ "learning_rate": 4.845130232513901e-06,
+ "loss": 0.4905,
+ "step": 232
+ },
+ {
+ "epoch": 1.098746081504702,
+ "grad_norm": 0.5283995866775513,
+ "learning_rate": 4.842799830946615e-06,
+ "loss": 0.4878,
+ "step": 233
+ },
+ {
+ "epoch": 1.103448275862069,
+ "grad_norm": 0.6373623013496399,
+ "learning_rate": 4.840452595684646e-06,
+ "loss": 0.4867,
+ "step": 234
+ },
+ {
+ "epoch": 1.1081504702194358,
+ "grad_norm": 0.4624481201171875,
+ "learning_rate": 4.83808854359356e-06,
+ "loss": 0.4793,
+ "step": 235
+ },
+ {
+ "epoch": 1.1128526645768024,
+ "grad_norm": 0.4659098982810974,
+ "learning_rate": 4.835707691659753e-06,
+ "loss": 0.4827,
+ "step": 236
+ },
+ {
+ "epoch": 1.1175548589341693,
+ "grad_norm": 0.4920850396156311,
+ "learning_rate": 4.8333100569903365e-06,
+ "loss": 0.4932,
+ "step": 237
+ },
+ {
+ "epoch": 1.1222570532915361,
+ "grad_norm": 0.492286741733551,
+ "learning_rate": 4.8308956568130094e-06,
+ "loss": 0.5144,
+ "step": 238
+ },
+ {
+ "epoch": 1.1269592476489028,
+ "grad_norm": 0.5429807901382446,
+ "learning_rate": 4.828464508475934e-06,
+ "loss": 0.5054,
+ "step": 239
+ },
+ {
+ "epoch": 1.1316614420062696,
+ "grad_norm": 2.4671998023986816,
+ "learning_rate": 4.826016629447616e-06,
+ "loss": 0.5073,
+ "step": 240
+ },
+ {
+ "epoch": 1.1363636363636362,
+ "grad_norm": 0.4593118131160736,
+ "learning_rate": 4.823552037316775e-06,
+ "loss": 0.4856,
+ "step": 241
+ },
+ {
+ "epoch": 1.141065830721003,
+ "grad_norm": 0.6855646371841431,
+ "learning_rate": 4.821070749792218e-06,
+ "loss": 0.5388,
+ "step": 242
+ },
+ {
+ "epoch": 1.14576802507837,
+ "grad_norm": 0.5722374320030212,
+ "learning_rate": 4.818572784702713e-06,
+ "loss": 0.51,
+ "step": 243
+ },
+ {
+ "epoch": 1.1504702194357366,
+ "grad_norm": 0.4901357591152191,
+ "learning_rate": 4.816058159996863e-06,
+ "loss": 0.5201,
+ "step": 244
+ },
+ {
+ "epoch": 1.1551724137931034,
+ "grad_norm": 0.4655209481716156,
+ "learning_rate": 4.813526893742972e-06,
+ "loss": 0.501,
+ "step": 245
+ },
+ {
+ "epoch": 1.1598746081504703,
+ "grad_norm": 0.7608394622802734,
+ "learning_rate": 4.810979004128924e-06,
+ "loss": 0.4961,
+ "step": 246
+ },
+ {
+ "epoch": 1.164576802507837,
+ "grad_norm": 0.4857081472873688,
+ "learning_rate": 4.808414509462042e-06,
+ "loss": 0.5174,
+ "step": 247
+ },
+ {
+ "epoch": 1.1692789968652038,
+ "grad_norm": 0.46672946214675903,
+ "learning_rate": 4.80583342816896e-06,
+ "loss": 0.484,
+ "step": 248
+ },
+ {
+ "epoch": 1.1739811912225706,
+ "grad_norm": 0.46982088685035706,
+ "learning_rate": 4.803235778795496e-06,
+ "loss": 0.5236,
+ "step": 249
+ },
+ {
+ "epoch": 1.1786833855799372,
+ "grad_norm": 0.5086098909378052,
+ "learning_rate": 4.800621580006511e-06,
+ "loss": 0.4673,
+ "step": 250
+ },
+ {
+ "epoch": 1.183385579937304,
+ "grad_norm": 0.45968860387802124,
+ "learning_rate": 4.797990850585782e-06,
+ "loss": 0.5151,
+ "step": 251
+ },
+ {
+ "epoch": 1.188087774294671,
+ "grad_norm": 0.49544984102249146,
+ "learning_rate": 4.79534360943586e-06,
+ "loss": 0.494,
+ "step": 252
+ },
+ {
+ "epoch": 1.1927899686520376,
+ "grad_norm": 0.531892716884613,
+ "learning_rate": 4.792679875577937e-06,
+ "loss": 0.4778,
+ "step": 253
+ },
+ {
+ "epoch": 1.1974921630094044,
+ "grad_norm": 0.5013542175292969,
+ "learning_rate": 4.789999668151714e-06,
+ "loss": 0.5132,
+ "step": 254
+ },
+ {
+ "epoch": 1.2021943573667713,
+ "grad_norm": 0.46963250637054443,
+ "learning_rate": 4.7873030064152545e-06,
+ "loss": 0.4938,
+ "step": 255
+ },
+ {
+ "epoch": 1.206896551724138,
+ "grad_norm": 0.465285986661911,
+ "learning_rate": 4.784589909744856e-06,
+ "loss": 0.4898,
+ "step": 256
+ },
+ {
+ "epoch": 1.2115987460815048,
+ "grad_norm": 0.5183936357498169,
+ "learning_rate": 4.7818603976349005e-06,
+ "loss": 0.5004,
+ "step": 257
+ },
+ {
+ "epoch": 1.2163009404388714,
+ "grad_norm": 0.47324836254119873,
+ "learning_rate": 4.779114489697724e-06,
+ "loss": 0.4972,
+ "step": 258
+ },
+ {
+ "epoch": 1.2210031347962382,
+ "grad_norm": 0.5208264589309692,
+ "learning_rate": 4.776352205663469e-06,
+ "loss": 0.5023,
+ "step": 259
+ },
+ {
+ "epoch": 1.225705329153605,
+ "grad_norm": 0.5583804845809937,
+ "learning_rate": 4.773573565379947e-06,
+ "loss": 0.5099,
+ "step": 260
+ },
+ {
+ "epoch": 1.2304075235109717,
+ "grad_norm": 0.5016160011291504,
+ "learning_rate": 4.770778588812489e-06,
+ "loss": 0.4765,
+ "step": 261
+ },
+ {
+ "epoch": 1.2351097178683386,
+ "grad_norm": 0.50210040807724,
+ "learning_rate": 4.7679672960438135e-06,
+ "loss": 0.5029,
+ "step": 262
+ },
+ {
+ "epoch": 1.2398119122257054,
+ "grad_norm": 0.6636150479316711,
+ "learning_rate": 4.765139707273872e-06,
+ "loss": 0.4909,
+ "step": 263
+ },
+ {
+ "epoch": 1.244514106583072,
+ "grad_norm": 0.4798625111579895,
+ "learning_rate": 4.762295842819707e-06,
+ "loss": 0.5012,
+ "step": 264
+ },
+ {
+ "epoch": 1.249216300940439,
+ "grad_norm": 0.5282374024391174,
+ "learning_rate": 4.759435723115308e-06,
+ "loss": 0.4681,
+ "step": 265
+ },
+ {
+ "epoch": 1.2539184952978055,
+ "grad_norm": 0.5356930494308472,
+ "learning_rate": 4.756559368711463e-06,
+ "loss": 0.506,
+ "step": 266
+ },
+ {
+ "epoch": 1.2586206896551724,
+ "grad_norm": 0.4857093095779419,
+ "learning_rate": 4.75366680027561e-06,
+ "loss": 0.4889,
+ "step": 267
+ },
+ {
+ "epoch": 1.2633228840125392,
+ "grad_norm": 0.484018474817276,
+ "learning_rate": 4.7507580385916906e-06,
+ "loss": 0.4899,
+ "step": 268
+ },
+ {
+ "epoch": 1.2680250783699059,
+ "grad_norm": 0.49720871448516846,
+ "learning_rate": 4.747833104559999e-06,
+ "loss": 0.4654,
+ "step": 269
+ },
+ {
+ "epoch": 1.2727272727272727,
+ "grad_norm": 0.4631911516189575,
+ "learning_rate": 4.744892019197033e-06,
+ "loss": 0.4796,
+ "step": 270
+ },
+ {
+ "epoch": 1.2774294670846396,
+ "grad_norm": 0.5116872787475586,
+ "learning_rate": 4.74193480363534e-06,
+ "loss": 0.4883,
+ "step": 271
+ },
+ {
+ "epoch": 1.2821316614420062,
+ "grad_norm": 0.5275093913078308,
+ "learning_rate": 4.738961479123373e-06,
+ "loss": 0.496,
+ "step": 272
+ },
+ {
+ "epoch": 1.286833855799373,
+ "grad_norm": 0.5001885890960693,
+ "learning_rate": 4.735972067025326e-06,
+ "loss": 0.5012,
+ "step": 273
+ },
+ {
+ "epoch": 1.29153605015674,
+ "grad_norm": 0.5875861048698425,
+ "learning_rate": 4.732966588820991e-06,
+ "loss": 0.4951,
+ "step": 274
+ },
+ {
+ "epoch": 1.2962382445141065,
+ "grad_norm": 0.4893011748790741,
+ "learning_rate": 4.729945066105599e-06,
+ "loss": 0.4742,
+ "step": 275
+ },
+ {
+ "epoch": 1.3009404388714734,
+ "grad_norm": 0.4648543894290924,
+ "learning_rate": 4.726907520589664e-06,
+ "loss": 0.466,
+ "step": 276
+ },
+ {
+ "epoch": 1.3056426332288402,
+ "grad_norm": 0.5300162434577942,
+ "learning_rate": 4.72385397409883e-06,
+ "loss": 0.5072,
+ "step": 277
+ },
+ {
+ "epoch": 1.3103448275862069,
+ "grad_norm": 0.4667080044746399,
+ "learning_rate": 4.720784448573712e-06,
+ "loss": 0.4986,
+ "step": 278
+ },
+ {
+ "epoch": 1.3150470219435737,
+ "grad_norm": 0.5278895497322083,
+ "learning_rate": 4.717698966069739e-06,
+ "loss": 0.5269,
+ "step": 279
+ },
+ {
+ "epoch": 1.3197492163009406,
+ "grad_norm": 0.5325866937637329,
+ "learning_rate": 4.7145975487569965e-06,
+ "loss": 0.5074,
+ "step": 280
+ },
+ {
+ "epoch": 1.3244514106583072,
+ "grad_norm": 0.500861644744873,
+ "learning_rate": 4.711480218920064e-06,
+ "loss": 0.4695,
+ "step": 281
+ },
+ {
+ "epoch": 1.329153605015674,
+ "grad_norm": 0.5263222455978394,
+ "learning_rate": 4.708346998957859e-06,
+ "loss": 0.5173,
+ "step": 282
+ },
+ {
+ "epoch": 1.3338557993730409,
+ "grad_norm": 0.622900128364563,
+ "learning_rate": 4.705197911383473e-06,
+ "loss": 0.4905,
+ "step": 283
+ },
+ {
+ "epoch": 1.3385579937304075,
+ "grad_norm": 0.49273768067359924,
+ "learning_rate": 4.7020329788240115e-06,
+ "loss": 0.4743,
+ "step": 284
+ },
+ {
+ "epoch": 1.3432601880877744,
+ "grad_norm": 0.49558964371681213,
+ "learning_rate": 4.6988522240204325e-06,
+ "loss": 0.4824,
+ "step": 285
+ },
+ {
+ "epoch": 1.347962382445141,
+ "grad_norm": 0.4743976891040802,
+ "learning_rate": 4.695655669827377e-06,
+ "loss": 0.4977,
+ "step": 286
+ },
+ {
+ "epoch": 1.3526645768025078,
+ "grad_norm": 0.49542659521102905,
+ "learning_rate": 4.6924433392130135e-06,
+ "loss": 0.4924,
+ "step": 287
+ },
+ {
+ "epoch": 1.3573667711598745,
+ "grad_norm": 0.7385990619659424,
+ "learning_rate": 4.689215255258866e-06,
+ "loss": 0.5091,
+ "step": 288
+ },
+ {
+ "epoch": 1.3620689655172413,
+ "grad_norm": 0.4826123118400574,
+ "learning_rate": 4.685971441159653e-06,
+ "loss": 0.4791,
+ "step": 289
+ },
+ {
+ "epoch": 1.3667711598746082,
+ "grad_norm": 0.5389033555984497,
+ "learning_rate": 4.682711920223115e-06,
+ "loss": 0.4751,
+ "step": 290
+ },
+ {
+ "epoch": 1.3714733542319748,
+ "grad_norm": 0.5059546232223511,
+ "learning_rate": 4.679436715869856e-06,
+ "loss": 0.499,
+ "step": 291
+ },
+ {
+ "epoch": 1.3761755485893417,
+ "grad_norm": 0.5682849884033203,
+ "learning_rate": 4.676145851633166e-06,
+ "loss": 0.5143,
+ "step": 292
+ },
+ {
+ "epoch": 1.3808777429467085,
+ "grad_norm": 0.4754337668418884,
+ "learning_rate": 4.672839351158856e-06,
+ "loss": 0.4997,
+ "step": 293
+ },
+ {
+ "epoch": 1.3855799373040751,
+ "grad_norm": 0.5227643847465515,
+ "learning_rate": 4.669517238205089e-06,
+ "loss": 0.4834,
+ "step": 294
+ },
+ {
+ "epoch": 1.390282131661442,
+ "grad_norm": 0.4954044222831726,
+ "learning_rate": 4.666179536642208e-06,
+ "loss": 0.483,
+ "step": 295
+ },
+ {
+ "epoch": 1.3949843260188088,
+ "grad_norm": 0.4909021556377411,
+ "learning_rate": 4.662826270452565e-06,
+ "loss": 0.4808,
+ "step": 296
+ },
+ {
+ "epoch": 1.3996865203761755,
+ "grad_norm": 0.4666971266269684,
+ "learning_rate": 4.659457463730347e-06,
+ "loss": 0.488,
+ "step": 297
+ },
+ {
+ "epoch": 1.4043887147335423,
+ "grad_norm": 0.5064187049865723,
+ "learning_rate": 4.6560731406814056e-06,
+ "loss": 0.5046,
+ "step": 298
+ },
+ {
+ "epoch": 1.4090909090909092,
+ "grad_norm": 0.4958318769931793,
+ "learning_rate": 4.65267332562308e-06,
+ "loss": 0.5102,
+ "step": 299
+ },
+ {
+ "epoch": 1.4137931034482758,
+ "grad_norm": 0.5080632567405701,
+ "learning_rate": 4.649258042984026e-06,
+ "loss": 0.5055,
+ "step": 300
+ },
+ {
+ "epoch": 1.4184952978056427,
+ "grad_norm": 0.46236541867256165,
+ "learning_rate": 4.6458273173040395e-06,
+ "loss": 0.4606,
+ "step": 301
+ },
+ {
+ "epoch": 1.4231974921630095,
+ "grad_norm": 1.8524898290634155,
+ "learning_rate": 4.642381173233874e-06,
+ "loss": 0.5002,
+ "step": 302
+ },
+ {
+ "epoch": 1.4278996865203761,
+ "grad_norm": 0.5202615261077881,
+ "learning_rate": 4.638919635535073e-06,
+ "loss": 0.4562,
+ "step": 303
+ },
+ {
+ "epoch": 1.432601880877743,
+ "grad_norm": 0.5293647050857544,
+ "learning_rate": 4.635442729079788e-06,
+ "loss": 0.4806,
+ "step": 304
+ },
+ {
+ "epoch": 1.4373040752351098,
+ "grad_norm": 0.5165356993675232,
+ "learning_rate": 4.6319504788505956e-06,
+ "loss": 0.4775,
+ "step": 305
+ },
+ {
+ "epoch": 1.4420062695924765,
+ "grad_norm": 0.5092841386795044,
+ "learning_rate": 4.628442909940325e-06,
+ "loss": 0.4892,
+ "step": 306
+ },
+ {
+ "epoch": 1.4467084639498433,
+ "grad_norm": 0.511424720287323,
+ "learning_rate": 4.624920047551874e-06,
+ "loss": 0.506,
+ "step": 307
+ },
+ {
+ "epoch": 1.4514106583072102,
+ "grad_norm": 0.5631566643714905,
+ "learning_rate": 4.621381916998029e-06,
+ "loss": 0.4741,
+ "step": 308
+ },
+ {
+ "epoch": 1.4561128526645768,
+ "grad_norm": 0.4748315215110779,
+ "learning_rate": 4.6178285437012806e-06,
+ "loss": 0.5084,
+ "step": 309
+ },
+ {
+ "epoch": 1.4608150470219436,
+ "grad_norm": 0.47158119082450867,
+ "learning_rate": 4.6142599531936435e-06,
+ "loss": 0.4697,
+ "step": 310
+ },
+ {
+ "epoch": 1.4655172413793103,
+ "grad_norm": 0.5358107089996338,
+ "learning_rate": 4.610676171116475e-06,
+ "loss": 0.491,
+ "step": 311
+ },
+ {
+ "epoch": 1.4702194357366771,
+ "grad_norm": 0.47717440128326416,
+ "learning_rate": 4.607077223220286e-06,
+ "loss": 0.4948,
+ "step": 312
+ },
+ {
+ "epoch": 1.4749216300940438,
+ "grad_norm": 0.5041193962097168,
+ "learning_rate": 4.603463135364556e-06,
+ "loss": 0.4648,
+ "step": 313
+ },
+ {
+ "epoch": 1.4796238244514106,
+ "grad_norm": 0.9311274290084839,
+ "learning_rate": 4.5998339335175555e-06,
+ "loss": 0.4866,
+ "step": 314
+ },
+ {
+ "epoch": 1.4843260188087775,
+ "grad_norm": 0.47408604621887207,
+ "learning_rate": 4.596189643756147e-06,
+ "loss": 0.4634,
+ "step": 315
+ },
+ {
+ "epoch": 1.489028213166144,
+ "grad_norm": 0.5052632093429565,
+ "learning_rate": 4.592530292265609e-06,
+ "loss": 0.4843,
+ "step": 316
+ },
+ {
+ "epoch": 1.493730407523511,
+ "grad_norm": 0.5100846886634827,
+ "learning_rate": 4.58885590533944e-06,
+ "loss": 0.4942,
+ "step": 317
+ },
+ {
+ "epoch": 1.4984326018808778,
+ "grad_norm": 0.5132214426994324,
+ "learning_rate": 4.585166509379173e-06,
+ "loss": 0.5135,
+ "step": 318
+ },
+ {
+ "epoch": 1.5031347962382444,
+ "grad_norm": 11.112855911254883,
+ "learning_rate": 4.581462130894186e-06,
+ "loss": 0.4933,
+ "step": 319
+ },
+ {
+ "epoch": 1.5078369905956113,
+ "grad_norm": 0.4873805642127991,
+ "learning_rate": 4.57774279650151e-06,
+ "loss": 0.483,
+ "step": 320
+ },
+ {
+ "epoch": 1.5125391849529781,
+ "grad_norm": 0.5026459693908691,
+ "learning_rate": 4.574008532925638e-06,
+ "loss": 0.5075,
+ "step": 321
+ },
+ {
+ "epoch": 1.5172413793103448,
+ "grad_norm": 0.489947110414505,
+ "learning_rate": 4.570259366998336e-06,
+ "loss": 0.4954,
+ "step": 322
+ },
+ {
+ "epoch": 1.5219435736677116,
+ "grad_norm": 0.48120853304862976,
+ "learning_rate": 4.566495325658445e-06,
+ "loss": 0.5221,
+ "step": 323
+ },
+ {
+ "epoch": 1.5266457680250785,
+ "grad_norm": 0.4880066514015198,
+ "learning_rate": 4.5627164359516915e-06,
+ "loss": 0.5031,
+ "step": 324
+ },
+ {
+ "epoch": 1.531347962382445,
+ "grad_norm": 0.5048410892486572,
+ "learning_rate": 4.558922725030491e-06,
+ "loss": 0.4757,
+ "step": 325
+ },
+ {
+ "epoch": 1.536050156739812,
+ "grad_norm": 0.7033756375312805,
+ "learning_rate": 4.555114220153755e-06,
+ "loss": 0.4285,
+ "step": 326
+ },
+ {
+ "epoch": 1.5407523510971788,
+ "grad_norm": 0.4716516435146332,
+ "learning_rate": 4.551290948686693e-06,
+ "loss": 0.5121,
+ "step": 327
+ },
+ {
+ "epoch": 1.5454545454545454,
+ "grad_norm": 0.4782696068286896,
+ "learning_rate": 4.547452938100615e-06,
+ "loss": 0.5176,
+ "step": 328
+ },
+ {
+ "epoch": 1.5501567398119123,
+ "grad_norm": 0.5119273066520691,
+ "learning_rate": 4.54360021597274e-06,
+ "loss": 0.4941,
+ "step": 329
+ },
+ {
+ "epoch": 1.5548589341692791,
+ "grad_norm": 0.5010069608688354,
+ "learning_rate": 4.539732809985989e-06,
+ "loss": 0.4862,
+ "step": 330
+ },
+ {
+ "epoch": 1.5595611285266457,
+ "grad_norm": 0.5129932165145874,
+ "learning_rate": 4.535850747928796e-06,
+ "loss": 0.4978,
+ "step": 331
+ },
+ {
+ "epoch": 1.5642633228840124,
+ "grad_norm": 0.4957594573497772,
+ "learning_rate": 4.531954057694897e-06,
+ "loss": 0.4814,
+ "step": 332
+ },
+ {
+ "epoch": 1.5689655172413794,
+ "grad_norm": 0.5642824172973633,
+ "learning_rate": 4.5280427672831414e-06,
+ "loss": 0.4888,
+ "step": 333
+ },
+ {
+ "epoch": 1.573667711598746,
+ "grad_norm": 0.4562854468822479,
+ "learning_rate": 4.524116904797281e-06,
+ "loss": 0.4648,
+ "step": 334
+ },
+ {
+ "epoch": 1.5783699059561127,
+ "grad_norm": 0.4849218428134918,
+ "learning_rate": 4.520176498445774e-06,
+ "loss": 0.476,
+ "step": 335
+ },
+ {
+ "epoch": 1.5830721003134798,
+ "grad_norm": 0.5046947002410889,
+ "learning_rate": 4.516221576541581e-06,
+ "loss": 0.4776,
+ "step": 336
+ },
+ {
+ "epoch": 1.5877742946708464,
+ "grad_norm": 0.48211777210235596,
+ "learning_rate": 4.512252167501959e-06,
+ "loss": 0.479,
+ "step": 337
+ },
+ {
+ "epoch": 1.592476489028213,
+ "grad_norm": 0.4812171459197998,
+ "learning_rate": 4.508268299848262e-06,
+ "loss": 0.4849,
+ "step": 338
+ },
+ {
+ "epoch": 1.59717868338558,
+ "grad_norm": 0.5865142345428467,
+ "learning_rate": 4.50427000220573e-06,
+ "loss": 0.499,
+ "step": 339
+ },
+ {
+ "epoch": 1.6018808777429467,
+ "grad_norm": 0.49277785420417786,
+ "learning_rate": 4.50025730330329e-06,
+ "loss": 0.475,
+ "step": 340
+ },
+ {
+ "epoch": 1.6065830721003134,
+ "grad_norm": 0.46771496534347534,
+ "learning_rate": 4.4962302319733445e-06,
+ "loss": 0.494,
+ "step": 341
+ },
+ {
+ "epoch": 1.6112852664576802,
+ "grad_norm": 0.5189441442489624,
+ "learning_rate": 4.492188817151565e-06,
+ "loss": 0.5275,
+ "step": 342
+ },
+ {
+ "epoch": 1.615987460815047,
+ "grad_norm": 0.48845574259757996,
+ "learning_rate": 4.488133087876688e-06,
+ "loss": 0.4676,
+ "step": 343
+ },
+ {
+ "epoch": 1.6206896551724137,
+ "grad_norm": 0.47189632058143616,
+ "learning_rate": 4.484063073290301e-06,
+ "loss": 0.4642,
+ "step": 344
+ },
+ {
+ "epoch": 1.6253918495297806,
+ "grad_norm": 0.5442587733268738,
+ "learning_rate": 4.479978802636637e-06,
+ "loss": 0.4981,
+ "step": 345
+ },
+ {
+ "epoch": 1.6300940438871474,
+ "grad_norm": 0.5048685073852539,
+ "learning_rate": 4.475880305262362e-06,
+ "loss": 0.5037,
+ "step": 346
+ },
+ {
+ "epoch": 1.634796238244514,
+ "grad_norm": 0.4781409800052643,
+ "learning_rate": 4.471767610616366e-06,
+ "loss": 0.4932,
+ "step": 347
+ },
+ {
+ "epoch": 1.6394984326018809,
+ "grad_norm": 0.47388938069343567,
+ "learning_rate": 4.467640748249549e-06,
+ "loss": 0.4687,
+ "step": 348
+ },
+ {
+ "epoch": 1.6442006269592477,
+ "grad_norm": 0.529712438583374,
+ "learning_rate": 4.4634997478146125e-06,
+ "loss": 0.487,
+ "step": 349
+ },
+ {
+ "epoch": 1.6489028213166144,
+ "grad_norm": 0.5114791393280029,
+ "learning_rate": 4.459344639065842e-06,
+ "loss": 0.4809,
+ "step": 350
+ },
+ {
+ "epoch": 1.6536050156739812,
+ "grad_norm": 0.45415258407592773,
+ "learning_rate": 4.455175451858897e-06,
+ "loss": 0.4901,
+ "step": 351
+ },
+ {
+ "epoch": 1.658307210031348,
+ "grad_norm": 0.5842339396476746,
+ "learning_rate": 4.450992216150592e-06,
+ "loss": 0.499,
+ "step": 352
+ },
+ {
+ "epoch": 1.6630094043887147,
+ "grad_norm": 0.48795560002326965,
+ "learning_rate": 4.446794961998689e-06,
+ "loss": 0.4659,
+ "step": 353
+ },
+ {
+ "epoch": 1.6677115987460815,
+ "grad_norm": 0.5531855225563049,
+ "learning_rate": 4.442583719561671e-06,
+ "loss": 0.4923,
+ "step": 354
+ },
+ {
+ "epoch": 1.6724137931034484,
+ "grad_norm": 0.5827644467353821,
+ "learning_rate": 4.438358519098536e-06,
+ "loss": 0.4991,
+ "step": 355
+ },
+ {
+ "epoch": 1.677115987460815,
+ "grad_norm": 0.5260423421859741,
+ "learning_rate": 4.4341193909685685e-06,
+ "loss": 0.4843,
+ "step": 356
+ },
+ {
+ "epoch": 1.6818181818181817,
+ "grad_norm": 0.4969344437122345,
+ "learning_rate": 4.429866365631134e-06,
+ "loss": 0.4915,
+ "step": 357
+ },
+ {
+ "epoch": 1.6865203761755487,
+ "grad_norm": 0.4725005030632019,
+ "learning_rate": 4.425599473645447e-06,
+ "loss": 0.4804,
+ "step": 358
+ },
+ {
+ "epoch": 1.6912225705329154,
+ "grad_norm": 0.47171467542648315,
+ "learning_rate": 4.421318745670364e-06,
+ "loss": 0.4823,
+ "step": 359
+ },
+ {
+ "epoch": 1.695924764890282,
+ "grad_norm": 0.4839799106121063,
+ "learning_rate": 4.4170242124641524e-06,
+ "loss": 0.4585,
+ "step": 360
+ },
+ {
+ "epoch": 1.700626959247649,
+ "grad_norm": 0.4786856472492218,
+ "learning_rate": 4.412715904884277e-06,
+ "loss": 0.49,
+ "step": 361
+ },
+ {
+ "epoch": 1.7053291536050157,
+ "grad_norm": 0.49980080127716064,
+ "learning_rate": 4.4083938538871735e-06,
+ "loss": 0.4675,
+ "step": 362
+ },
+ {
+ "epoch": 1.7100313479623823,
+ "grad_norm": 0.5201369524002075,
+ "learning_rate": 4.4040580905280295e-06,
+ "loss": 0.4862,
+ "step": 363
+ },
+ {
+ "epoch": 1.7147335423197492,
+ "grad_norm": 0.7051575183868408,
+ "learning_rate": 4.3997086459605586e-06,
+ "loss": 0.4822,
+ "step": 364
+ },
+ {
+ "epoch": 1.719435736677116,
+ "grad_norm": 0.48206666111946106,
+ "learning_rate": 4.395345551436779e-06,
+ "loss": 0.5076,
+ "step": 365
+ },
+ {
+ "epoch": 1.7241379310344827,
+ "grad_norm": 0.4817257821559906,
+ "learning_rate": 4.390968838306788e-06,
+ "loss": 0.4623,
+ "step": 366
+ },
+ {
+ "epoch": 1.7288401253918495,
+ "grad_norm": 0.5547840595245361,
+ "learning_rate": 4.386578538018535e-06,
+ "loss": 0.461,
+ "step": 367
+ },
+ {
+ "epoch": 1.7335423197492164,
+ "grad_norm": 0.5085346698760986,
+ "learning_rate": 4.382174682117598e-06,
+ "loss": 0.5068,
+ "step": 368
+ },
+ {
+ "epoch": 1.738244514106583,
+ "grad_norm": 0.4870692193508148,
+ "learning_rate": 4.377757302246956e-06,
+ "loss": 0.4403,
+ "step": 369
+ },
+ {
+ "epoch": 1.7429467084639498,
+ "grad_norm": 0.49482715129852295,
+ "learning_rate": 4.373326430146762e-06,
+ "loss": 0.4986,
+ "step": 370
+ },
+ {
+ "epoch": 1.7476489028213167,
+ "grad_norm": 0.5474854707717896,
+ "learning_rate": 4.368882097654113e-06,
+ "loss": 0.4938,
+ "step": 371
+ },
+ {
+ "epoch": 1.7523510971786833,
+ "grad_norm": 0.5055244565010071,
+ "learning_rate": 4.364424336702825e-06,
+ "loss": 0.4711,
+ "step": 372
+ },
+ {
+ "epoch": 1.7570532915360502,
+ "grad_norm": 0.48241329193115234,
+ "learning_rate": 4.3599531793232e-06,
+ "loss": 0.4856,
+ "step": 373
+ },
+ {
+ "epoch": 1.761755485893417,
+ "grad_norm": 0.4932602047920227,
+ "learning_rate": 4.355468657641797e-06,
+ "loss": 0.4818,
+ "step": 374
+ },
+ {
+ "epoch": 1.7664576802507836,
+ "grad_norm": 0.5512160658836365,
+ "learning_rate": 4.3509708038812035e-06,
+ "loss": 0.4864,
+ "step": 375
+ },
+ {
+ "epoch": 1.7711598746081505,
+ "grad_norm": 0.47026327252388,
+ "learning_rate": 4.346459650359798e-06,
+ "loss": 0.4825,
+ "step": 376
+ },
+ {
+ "epoch": 1.7758620689655173,
+ "grad_norm": 0.4831086993217468,
+ "learning_rate": 4.341935229491525e-06,
+ "loss": 0.4541,
+ "step": 377
+ },
+ {
+ "epoch": 1.780564263322884,
+ "grad_norm": 0.5045217871665955,
+ "learning_rate": 4.337397573785659e-06,
+ "loss": 0.5025,
+ "step": 378
+ },
+ {
+ "epoch": 1.7852664576802508,
+ "grad_norm": 0.5657753348350525,
+ "learning_rate": 4.332846715846566e-06,
+ "loss": 0.4698,
+ "step": 379
+ },
+ {
+ "epoch": 1.7899686520376177,
+ "grad_norm": 0.49546748399734497,
+ "learning_rate": 4.328282688373479e-06,
+ "loss": 0.4911,
+ "step": 380
+ },
+ {
+ "epoch": 1.7946708463949843,
+ "grad_norm": 0.5037291049957275,
+ "learning_rate": 4.323705524160258e-06,
+ "loss": 0.4877,
+ "step": 381
+ },
+ {
+ "epoch": 1.799373040752351,
+ "grad_norm": 0.5256901383399963,
+ "learning_rate": 4.319115256095149e-06,
+ "loss": 0.4662,
+ "step": 382
+ },
+ {
+ "epoch": 1.804075235109718,
+ "grad_norm": 0.4890702962875366,
+ "learning_rate": 4.314511917160557e-06,
+ "loss": 0.4683,
+ "step": 383
+ },
+ {
+ "epoch": 1.8087774294670846,
+ "grad_norm": 0.4724109470844269,
+ "learning_rate": 4.3098955404328045e-06,
+ "loss": 0.4602,
+ "step": 384
+ },
+ {
+ "epoch": 1.8134796238244513,
+ "grad_norm": 0.4933278560638428,
+ "learning_rate": 4.305266159081895e-06,
+ "loss": 0.4806,
+ "step": 385
+ },
+ {
+ "epoch": 1.8181818181818183,
+ "grad_norm": 0.5068219304084778,
+ "learning_rate": 4.3006238063712725e-06,
+ "loss": 0.4647,
+ "step": 386
+ },
+ {
+ "epoch": 1.822884012539185,
+ "grad_norm": 0.5293509364128113,
+ "learning_rate": 4.295968515657583e-06,
+ "loss": 0.4998,
+ "step": 387
+ },
+ {
+ "epoch": 1.8275862068965516,
+ "grad_norm": 0.4775199294090271,
+ "learning_rate": 4.29130032039044e-06,
+ "loss": 0.4821,
+ "step": 388
+ },
+ {
+ "epoch": 1.8322884012539185,
+ "grad_norm": 0.4914006292819977,
+ "learning_rate": 4.2866192541121755e-06,
+ "loss": 0.4735,
+ "step": 389
+ },
+ {
+ "epoch": 1.8369905956112853,
+ "grad_norm": 0.5009908080101013,
+ "learning_rate": 4.281925350457606e-06,
+ "loss": 0.4741,
+ "step": 390
+ },
+ {
+ "epoch": 1.841692789968652,
+ "grad_norm": 0.47211164236068726,
+ "learning_rate": 4.277218643153787e-06,
+ "loss": 0.4786,
+ "step": 391
+ },
+ {
+ "epoch": 1.8463949843260188,
+ "grad_norm": 1.9644113779067993,
+ "learning_rate": 4.272499166019771e-06,
+ "loss": 0.4759,
+ "step": 392
+ },
+ {
+ "epoch": 1.8510971786833856,
+ "grad_norm": 0.535971999168396,
+ "learning_rate": 4.267766952966369e-06,
+ "loss": 0.4665,
+ "step": 393
+ },
+ {
+ "epoch": 1.8557993730407523,
+ "grad_norm": 0.4666787385940552,
+ "learning_rate": 4.2630220379959006e-06,
+ "loss": 0.4417,
+ "step": 394
+ },
+ {
+ "epoch": 1.8605015673981191,
+ "grad_norm": 0.5976264476776123,
+ "learning_rate": 4.258264455201953e-06,
+ "loss": 0.4665,
+ "step": 395
+ },
+ {
+ "epoch": 1.865203761755486,
+ "grad_norm": 0.4814331531524658,
+ "learning_rate": 4.2534942387691335e-06,
+ "loss": 0.4896,
+ "step": 396
+ },
+ {
+ "epoch": 1.8699059561128526,
+ "grad_norm": 0.4929859936237335,
+ "learning_rate": 4.248711422972829e-06,
+ "loss": 0.4765,
+ "step": 397
+ },
+ {
+ "epoch": 1.8746081504702194,
+ "grad_norm": 0.517914354801178,
+ "learning_rate": 4.243916042178954e-06,
+ "loss": 0.4601,
+ "step": 398
+ },
+ {
+ "epoch": 1.8793103448275863,
+ "grad_norm": 0.47731271386146545,
+ "learning_rate": 4.239108130843709e-06,
+ "loss": 0.469,
+ "step": 399
+ },
+ {
+ "epoch": 1.884012539184953,
+ "grad_norm": 0.4939954876899719,
+ "learning_rate": 4.234287723513326e-06,
+ "loss": 0.4929,
+ "step": 400
+ },
+ {
+ "epoch": 1.8887147335423198,
+ "grad_norm": 0.48573923110961914,
+ "learning_rate": 4.229454854823827e-06,
+ "loss": 0.4913,
+ "step": 401
+ },
+ {
+ "epoch": 1.8934169278996866,
+ "grad_norm": 0.5146409273147583,
+ "learning_rate": 4.224609559500772e-06,
+ "loss": 0.502,
+ "step": 402
+ },
+ {
+ "epoch": 1.8981191222570533,
+ "grad_norm": 0.4884675443172455,
+ "learning_rate": 4.21975187235901e-06,
+ "loss": 0.4541,
+ "step": 403
+ },
+ {
+ "epoch": 1.90282131661442,
+ "grad_norm": 0.4871810972690582,
+ "learning_rate": 4.21488182830243e-06,
+ "loss": 0.4811,
+ "step": 404
+ },
+ {
+ "epoch": 1.907523510971787,
+ "grad_norm": 0.5089552402496338,
+ "learning_rate": 4.209999462323706e-06,
+ "loss": 0.4584,
+ "step": 405
+ },
+ {
+ "epoch": 1.9122257053291536,
+ "grad_norm": 0.6191231608390808,
+ "learning_rate": 4.20510480950405e-06,
+ "loss": 0.4885,
+ "step": 406
+ },
+ {
+ "epoch": 1.9169278996865202,
+ "grad_norm": 0.5512096285820007,
+ "learning_rate": 4.200197905012961e-06,
+ "loss": 0.4529,
+ "step": 407
+ },
+ {
+ "epoch": 1.9216300940438873,
+ "grad_norm": 0.4743112027645111,
+ "learning_rate": 4.195278784107965e-06,
+ "loss": 0.4702,
+ "step": 408
+ },
+ {
+ "epoch": 1.926332288401254,
+ "grad_norm": 0.4635118544101715,
+ "learning_rate": 4.19034748213437e-06,
+ "loss": 0.4718,
+ "step": 409
+ },
+ {
+ "epoch": 1.9310344827586206,
+ "grad_norm": 0.48715919256210327,
+ "learning_rate": 4.185404034525008e-06,
+ "loss": 0.4638,
+ "step": 410
+ },
+ {
+ "epoch": 1.9357366771159876,
+ "grad_norm": 0.5373724102973938,
+ "learning_rate": 4.180448476799981e-06,
+ "loss": 0.5009,
+ "step": 411
+ },
+ {
+ "epoch": 1.9404388714733543,
+ "grad_norm": 0.4978715479373932,
+ "learning_rate": 4.175480844566404e-06,
+ "loss": 0.4726,
+ "step": 412
+ },
+ {
+ "epoch": 1.9451410658307209,
+ "grad_norm": 0.44817060232162476,
+ "learning_rate": 4.170501173518152e-06,
+ "loss": 0.4683,
+ "step": 413
+ },
+ {
+ "epoch": 1.9498432601880877,
+ "grad_norm": 0.48472973704338074,
+ "learning_rate": 4.165509499435604e-06,
+ "loss": 0.4662,
+ "step": 414
+ },
+ {
+ "epoch": 1.9545454545454546,
+ "grad_norm": 0.6567174792289734,
+ "learning_rate": 4.16050585818538e-06,
+ "loss": 0.4801,
+ "step": 415
+ },
+ {
+ "epoch": 1.9592476489028212,
+ "grad_norm": 0.5131425857543945,
+ "learning_rate": 4.155490285720092e-06,
+ "loss": 0.5036,
+ "step": 416
+ },
+ {
+ "epoch": 1.963949843260188,
+ "grad_norm": 0.46051982045173645,
+ "learning_rate": 4.150462818078079e-06,
+ "loss": 0.4911,
+ "step": 417
+ },
+ {
+ "epoch": 1.968652037617555,
+ "grad_norm": 0.5288883447647095,
+ "learning_rate": 4.145423491383153e-06,
+ "loss": 0.4871,
+ "step": 418
+ },
+ {
+ "epoch": 1.9733542319749215,
+ "grad_norm": 0.5143817663192749,
+ "learning_rate": 4.14037234184433e-06,
+ "loss": 0.5027,
+ "step": 419
+ },
+ {
+ "epoch": 1.9780564263322884,
+ "grad_norm": 0.46323707699775696,
+ "learning_rate": 4.135309405755583e-06,
+ "loss": 0.4876,
+ "step": 420
+ },
+ {
+ "epoch": 1.9827586206896552,
+ "grad_norm": 0.5239706039428711,
+ "learning_rate": 4.130234719495574e-06,
+ "loss": 0.4702,
+ "step": 421
+ },
+ {
+ "epoch": 1.9874608150470219,
+ "grad_norm": 0.538753867149353,
+ "learning_rate": 4.125148319527391e-06,
+ "loss": 0.4638,
+ "step": 422
+ },
+ {
+ "epoch": 1.9921630094043887,
+ "grad_norm": 0.5180181860923767,
+ "learning_rate": 4.1200502423982904e-06,
+ "loss": 0.4841,
+ "step": 423
+ },
+ {
+ "epoch": 1.9968652037617556,
+ "grad_norm": 0.6698167324066162,
+ "learning_rate": 4.1149405247394295e-06,
+ "loss": 0.4882,
+ "step": 424
+ },
+ {
+ "epoch": 2.0047021943573666,
+ "grad_norm": 0.9728522896766663,
+ "learning_rate": 4.10981920326561e-06,
+ "loss": 0.9125,
+ "step": 425
+ },
+ {
+ "epoch": 2.0094043887147337,
+ "grad_norm": 0.7356107831001282,
+ "learning_rate": 4.104686314775009e-06,
+ "loss": 0.4422,
+ "step": 426
+ },
+ {
+ "epoch": 2.0141065830721003,
+ "grad_norm": 0.44414228200912476,
+ "learning_rate": 4.099541896148914e-06,
+ "loss": 0.4511,
+ "step": 427
+ },
+ {
+ "epoch": 2.018808777429467,
+ "grad_norm": 0.5738011002540588,
+ "learning_rate": 4.094385984351462e-06,
+ "loss": 0.4457,
+ "step": 428
+ },
+ {
+ "epoch": 2.023510971786834,
+ "grad_norm": 0.4643106460571289,
+ "learning_rate": 4.0892186164293715e-06,
+ "loss": 0.4644,
+ "step": 429
+ },
+ {
+ "epoch": 2.0282131661442007,
+ "grad_norm": 0.5355309247970581,
+ "learning_rate": 4.0840398295116745e-06,
+ "loss": 0.4535,
+ "step": 430
+ },
+ {
+ "epoch": 2.0329153605015673,
+ "grad_norm": 0.512458324432373,
+ "learning_rate": 4.078849660809456e-06,
+ "loss": 0.4481,
+ "step": 431
+ },
+ {
+ "epoch": 2.0376175548589344,
+ "grad_norm": 0.5055253505706787,
+ "learning_rate": 4.073648147615579e-06,
+ "loss": 0.4309,
+ "step": 432
+ },
+ {
+ "epoch": 2.042319749216301,
+ "grad_norm": 0.5128353834152222,
+ "learning_rate": 4.068435327304421e-06,
+ "loss": 0.4562,
+ "step": 433
+ },
+ {
+ "epoch": 2.0470219435736676,
+ "grad_norm": 0.4432103633880615,
+ "learning_rate": 4.063211237331603e-06,
+ "loss": 0.4535,
+ "step": 434
+ },
+ {
+ "epoch": 2.0517241379310347,
+ "grad_norm": 0.5092498660087585,
+ "learning_rate": 4.057975915233725e-06,
+ "loss": 0.4385,
+ "step": 435
+ },
+ {
+ "epoch": 2.0564263322884013,
+ "grad_norm": 0.4798133671283722,
+ "learning_rate": 4.052729398628089e-06,
+ "loss": 0.466,
+ "step": 436
+ },
+ {
+ "epoch": 2.061128526645768,
+ "grad_norm": 0.5094019770622253,
+ "learning_rate": 4.047471725212437e-06,
+ "loss": 0.4624,
+ "step": 437
+ },
+ {
+ "epoch": 2.0658307210031346,
+ "grad_norm": 0.5814178586006165,
+ "learning_rate": 4.042202932764673e-06,
+ "loss": 0.4472,
+ "step": 438
+ },
+ {
+ "epoch": 2.0705329153605017,
+ "grad_norm": 0.503394365310669,
+ "learning_rate": 4.036923059142595e-06,
+ "loss": 0.4481,
+ "step": 439
+ },
+ {
+ "epoch": 2.0752351097178683,
+ "grad_norm": 0.5108861923217773,
+ "learning_rate": 4.031632142283623e-06,
+ "loss": 0.4416,
+ "step": 440
+ },
+ {
+ "epoch": 2.079937304075235,
+ "grad_norm": 0.5303971171379089,
+ "learning_rate": 4.026330220204524e-06,
+ "loss": 0.4515,
+ "step": 441
+ },
+ {
+ "epoch": 2.084639498432602,
+ "grad_norm": 0.45014286041259766,
+ "learning_rate": 4.021017331001146e-06,
+ "loss": 0.441,
+ "step": 442
+ },
+ {
+ "epoch": 2.0893416927899686,
+ "grad_norm": 0.5371219515800476,
+ "learning_rate": 4.015693512848131e-06,
+ "loss": 0.4471,
+ "step": 443
+ },
+ {
+ "epoch": 2.0940438871473352,
+ "grad_norm": 0.5105510354042053,
+ "learning_rate": 4.0103588039986556e-06,
+ "loss": 0.4534,
+ "step": 444
+ },
+ {
+ "epoch": 2.0987460815047023,
+ "grad_norm": 0.4960611164569855,
+ "learning_rate": 4.005013242784146e-06,
+ "loss": 0.46,
+ "step": 445
+ },
+ {
+ "epoch": 2.103448275862069,
+ "grad_norm": 0.500354528427124,
+ "learning_rate": 3.999656867614006e-06,
+ "loss": 0.45,
+ "step": 446
+ },
+ {
+ "epoch": 2.1081504702194356,
+ "grad_norm": 0.4733876585960388,
+ "learning_rate": 3.994289716975341e-06,
+ "loss": 0.4644,
+ "step": 447
+ },
+ {
+ "epoch": 2.1128526645768027,
+ "grad_norm": 0.5002915263175964,
+ "learning_rate": 3.988911829432682e-06,
+ "loss": 0.4493,
+ "step": 448
+ },
+ {
+ "epoch": 2.1175548589341693,
+ "grad_norm": 0.48520293831825256,
+ "learning_rate": 3.983523243627706e-06,
+ "loss": 0.4458,
+ "step": 449
+ },
+ {
+ "epoch": 2.122257053291536,
+ "grad_norm": 0.6339934468269348,
+ "learning_rate": 3.978123998278962e-06,
+ "loss": 0.4352,
+ "step": 450
+ },
+ {
+ "epoch": 2.126959247648903,
+ "grad_norm": 1.172338843345642,
+ "learning_rate": 3.97271413218159e-06,
+ "loss": 0.4664,
+ "step": 451
+ },
+ {
+ "epoch": 2.1316614420062696,
+ "grad_norm": 0.47842296957969666,
+ "learning_rate": 3.9672936842070425e-06,
+ "loss": 0.4604,
+ "step": 452
+ },
+ {
+ "epoch": 2.1363636363636362,
+ "grad_norm": 0.506851077079773,
+ "learning_rate": 3.9618626933028086e-06,
+ "loss": 0.4674,
+ "step": 453
+ },
+ {
+ "epoch": 2.1410658307210033,
+ "grad_norm": 0.4922677278518677,
+ "learning_rate": 3.956421198492128e-06,
+ "loss": 0.4476,
+ "step": 454
+ },
+ {
+ "epoch": 2.14576802507837,
+ "grad_norm": 0.5307339429855347,
+ "learning_rate": 3.950969238873714e-06,
+ "loss": 0.4463,
+ "step": 455
+ },
+ {
+ "epoch": 2.1504702194357366,
+ "grad_norm": 0.5131121873855591,
+ "learning_rate": 3.9455068536214765e-06,
+ "loss": 0.4779,
+ "step": 456
+ },
+ {
+ "epoch": 2.1551724137931036,
+ "grad_norm": 0.5438089966773987,
+ "learning_rate": 3.9400340819842335e-06,
+ "loss": 0.4563,
+ "step": 457
+ },
+ {
+ "epoch": 2.1598746081504703,
+ "grad_norm": 0.7426711916923523,
+ "learning_rate": 3.934550963285432e-06,
+ "loss": 0.4561,
+ "step": 458
+ },
+ {
+ "epoch": 2.164576802507837,
+ "grad_norm": 0.482920378446579,
+ "learning_rate": 3.9290575369228664e-06,
+ "loss": 0.4293,
+ "step": 459
+ },
+ {
+ "epoch": 2.169278996865204,
+ "grad_norm": 0.6583715081214905,
+ "learning_rate": 3.923553842368396e-06,
+ "loss": 0.4682,
+ "step": 460
+ },
+ {
+ "epoch": 2.1739811912225706,
+ "grad_norm": 0.47901806235313416,
+ "learning_rate": 3.918039919167658e-06,
+ "loss": 0.4342,
+ "step": 461
+ },
+ {
+ "epoch": 2.1786833855799372,
+ "grad_norm": 0.4929746389389038,
+ "learning_rate": 3.912515806939786e-06,
+ "loss": 0.4478,
+ "step": 462
+ },
+ {
+ "epoch": 2.183385579937304,
+ "grad_norm": 0.48205333948135376,
+ "learning_rate": 3.906981545377124e-06,
+ "loss": 0.4595,
+ "step": 463
+ },
+ {
+ "epoch": 2.188087774294671,
+ "grad_norm": 0.5059337019920349,
+ "learning_rate": 3.901437174244943e-06,
+ "loss": 0.4294,
+ "step": 464
+ },
+ {
+ "epoch": 2.1927899686520376,
+ "grad_norm": 0.4752981662750244,
+ "learning_rate": 3.895882733381154e-06,
+ "loss": 0.448,
+ "step": 465
+ },
+ {
+ "epoch": 2.197492163009404,
+ "grad_norm": 0.5249196290969849,
+ "learning_rate": 3.890318262696023e-06,
+ "loss": 0.4655,
+ "step": 466
+ },
+ {
+ "epoch": 2.2021943573667713,
+ "grad_norm": 0.48044726252555847,
+ "learning_rate": 3.8847438021718805e-06,
+ "loss": 0.4413,
+ "step": 467
+ },
+ {
+ "epoch": 2.206896551724138,
+ "grad_norm": 0.84516841173172,
+ "learning_rate": 3.879159391862839e-06,
+ "loss": 0.4645,
+ "step": 468
+ },
+ {
+ "epoch": 2.2115987460815045,
+ "grad_norm": 0.5334392786026001,
+ "learning_rate": 3.873565071894503e-06,
+ "loss": 0.4347,
+ "step": 469
+ },
+ {
+ "epoch": 2.2163009404388716,
+ "grad_norm": 0.5113687515258789,
+ "learning_rate": 3.86796088246368e-06,
+ "loss": 0.4314,
+ "step": 470
+ },
+ {
+ "epoch": 2.2210031347962382,
+ "grad_norm": 0.5226101279258728,
+ "learning_rate": 3.8623468638380905e-06,
+ "loss": 0.418,
+ "step": 471
+ },
+ {
+ "epoch": 2.225705329153605,
+ "grad_norm": 0.4901522099971771,
+ "learning_rate": 3.856723056356085e-06,
+ "loss": 0.4597,
+ "step": 472
+ },
+ {
+ "epoch": 2.230407523510972,
+ "grad_norm": 0.5312012434005737,
+ "learning_rate": 3.851089500426346e-06,
+ "loss": 0.4444,
+ "step": 473
+ },
+ {
+ "epoch": 2.2351097178683386,
+ "grad_norm": 0.5347906351089478,
+ "learning_rate": 3.845446236527605e-06,
+ "loss": 0.4447,
+ "step": 474
+ },
+ {
+ "epoch": 2.239811912225705,
+ "grad_norm": 0.4781494438648224,
+ "learning_rate": 3.8397933052083445e-06,
+ "loss": 0.462,
+ "step": 475
+ },
+ {
+ "epoch": 2.2445141065830723,
+ "grad_norm": 0.5215012431144714,
+ "learning_rate": 3.834130747086512e-06,
+ "loss": 0.4475,
+ "step": 476
+ },
+ {
+ "epoch": 2.249216300940439,
+ "grad_norm": 0.5048666000366211,
+ "learning_rate": 3.828458602849226e-06,
+ "loss": 0.4483,
+ "step": 477
+ },
+ {
+ "epoch": 2.2539184952978055,
+ "grad_norm": 0.5508173108100891,
+ "learning_rate": 3.822776913252485e-06,
+ "loss": 0.4511,
+ "step": 478
+ },
+ {
+ "epoch": 2.2586206896551726,
+ "grad_norm": 0.5031043887138367,
+ "learning_rate": 3.817085719120872e-06,
+ "loss": 0.4019,
+ "step": 479
+ },
+ {
+ "epoch": 2.2633228840125392,
+ "grad_norm": 0.508939802646637,
+ "learning_rate": 3.811385061347263e-06,
+ "loss": 0.4461,
+ "step": 480
+ },
+ {
+ "epoch": 2.268025078369906,
+ "grad_norm": 0.5605170726776123,
+ "learning_rate": 3.805674980892535e-06,
+ "loss": 0.4695,
+ "step": 481
+ },
+ {
+ "epoch": 2.2727272727272725,
+ "grad_norm": 0.5526806712150574,
+ "learning_rate": 3.7999555187852667e-06,
+ "loss": 0.4575,
+ "step": 482
+ },
+ {
+ "epoch": 2.2774294670846396,
+ "grad_norm": 0.47659724950790405,
+ "learning_rate": 3.7942267161214497e-06,
+ "loss": 0.4433,
+ "step": 483
+ },
+ {
+ "epoch": 2.282131661442006,
+ "grad_norm": 0.49713975191116333,
+ "learning_rate": 3.7884886140641884e-06,
+ "loss": 0.4692,
+ "step": 484
+ },
+ {
+ "epoch": 2.2868338557993733,
+ "grad_norm": 0.48685988783836365,
+ "learning_rate": 3.7827412538434062e-06,
+ "loss": 0.4328,
+ "step": 485
+ },
+ {
+ "epoch": 2.29153605015674,
+ "grad_norm": 0.5074832439422607,
+ "learning_rate": 3.7769846767555495e-06,
+ "loss": 0.4598,
+ "step": 486
+ },
+ {
+ "epoch": 2.2962382445141065,
+ "grad_norm": 0.5333994030952454,
+ "learning_rate": 3.7712189241632898e-06,
+ "loss": 0.4554,
+ "step": 487
+ },
+ {
+ "epoch": 2.300940438871473,
+ "grad_norm": 0.49985551834106445,
+ "learning_rate": 3.7654440374952288e-06,
+ "loss": 0.4421,
+ "step": 488
+ },
+ {
+ "epoch": 2.30564263322884,
+ "grad_norm": 0.4791257679462433,
+ "learning_rate": 3.7596600582455976e-06,
+ "loss": 0.4187,
+ "step": 489
+ },
+ {
+ "epoch": 2.310344827586207,
+ "grad_norm": 0.4951220154762268,
+ "learning_rate": 3.75386702797396e-06,
+ "loss": 0.4205,
+ "step": 490
+ },
+ {
+ "epoch": 2.3150470219435735,
+ "grad_norm": 0.4765990674495697,
+ "learning_rate": 3.7480649883049164e-06,
+ "loss": 0.4251,
+ "step": 491
+ },
+ {
+ "epoch": 2.3197492163009406,
+ "grad_norm": 0.5125405192375183,
+ "learning_rate": 3.7422539809277993e-06,
+ "loss": 0.4361,
+ "step": 492
+ },
+ {
+ "epoch": 2.324451410658307,
+ "grad_norm": 0.5286112427711487,
+ "learning_rate": 3.736434047596379e-06,
+ "loss": 0.4423,
+ "step": 493
+ },
+ {
+ "epoch": 2.329153605015674,
+ "grad_norm": 0.47961002588272095,
+ "learning_rate": 3.73060523012856e-06,
+ "loss": 0.453,
+ "step": 494
+ },
+ {
+ "epoch": 2.333855799373041,
+ "grad_norm": 0.5857998728752136,
+ "learning_rate": 3.724767570406082e-06,
+ "loss": 0.4674,
+ "step": 495
+ },
+ {
+ "epoch": 2.3385579937304075,
+ "grad_norm": 0.5348326563835144,
+ "learning_rate": 3.7189211103742206e-06,
+ "loss": 0.4267,
+ "step": 496
+ },
+ {
+ "epoch": 2.343260188087774,
+ "grad_norm": 0.4718475937843323,
+ "learning_rate": 3.7130658920414818e-06,
+ "loss": 0.4619,
+ "step": 497
+ },
+ {
+ "epoch": 2.347962382445141,
+ "grad_norm": 0.44225215911865234,
+ "learning_rate": 3.7072019574793034e-06,
+ "loss": 0.4712,
+ "step": 498
+ },
+ {
+ "epoch": 2.352664576802508,
+ "grad_norm": 0.48492008447647095,
+ "learning_rate": 3.701329348821752e-06,
+ "loss": 0.4521,
+ "step": 499
+ },
+ {
+ "epoch": 2.3573667711598745,
+ "grad_norm": 0.49741214513778687,
+ "learning_rate": 3.695448108265221e-06,
+ "loss": 0.4378,
+ "step": 500
+ },
+ {
+ "epoch": 2.3620689655172415,
+ "grad_norm": 0.5086454749107361,
+ "learning_rate": 3.6895582780681254e-06,
+ "loss": 0.4349,
+ "step": 501
+ },
+ {
+ "epoch": 2.366771159874608,
+ "grad_norm": 0.49111631512641907,
+ "learning_rate": 3.683659900550598e-06,
+ "loss": 0.4625,
+ "step": 502
+ },
+ {
+ "epoch": 2.371473354231975,
+ "grad_norm": 0.5006322264671326,
+ "learning_rate": 3.6777530180941894e-06,
+ "loss": 0.4457,
+ "step": 503
+ },
+ {
+ "epoch": 2.376175548589342,
+ "grad_norm": 0.5934097170829773,
+ "learning_rate": 3.671837673141559e-06,
+ "loss": 0.4306,
+ "step": 504
+ },
+ {
+ "epoch": 2.3808777429467085,
+ "grad_norm": 0.626039981842041,
+ "learning_rate": 3.6659139081961707e-06,
+ "loss": 0.4464,
+ "step": 505
+ },
+ {
+ "epoch": 2.385579937304075,
+ "grad_norm": 0.4751131236553192,
+ "learning_rate": 3.6599817658219916e-06,
+ "loss": 0.4508,
+ "step": 506
+ },
+ {
+ "epoch": 2.3902821316614418,
+ "grad_norm": 1.4542276859283447,
+ "learning_rate": 3.6540412886431796e-06,
+ "loss": 0.4606,
+ "step": 507
+ },
+ {
+ "epoch": 2.394984326018809,
+ "grad_norm": 0.5189768075942993,
+ "learning_rate": 3.648092519343783e-06,
+ "loss": 0.4435,
+ "step": 508
+ },
+ {
+ "epoch": 2.3996865203761755,
+ "grad_norm": 1.4583938121795654,
+ "learning_rate": 3.642135500667431e-06,
+ "loss": 0.4314,
+ "step": 509
+ },
+ {
+ "epoch": 2.4043887147335425,
+ "grad_norm": 0.5038107633590698,
+ "learning_rate": 3.6361702754170247e-06,
+ "loss": 0.4463,
+ "step": 510
+ },
+ {
+ "epoch": 2.409090909090909,
+ "grad_norm": 0.5786447525024414,
+ "learning_rate": 3.630196886454435e-06,
+ "loss": 0.4281,
+ "step": 511
+ },
+ {
+ "epoch": 2.413793103448276,
+ "grad_norm": 0.48684218525886536,
+ "learning_rate": 3.62421537670019e-06,
+ "loss": 0.4432,
+ "step": 512
+ },
+ {
+ "epoch": 2.4184952978056424,
+ "grad_norm": 0.5117013454437256,
+ "learning_rate": 3.618225789133167e-06,
+ "loss": 0.4464,
+ "step": 513
+ },
+ {
+ "epoch": 2.4231974921630095,
+ "grad_norm": 0.49249181151390076,
+ "learning_rate": 3.612228166790287e-06,
+ "loss": 0.4465,
+ "step": 514
+ },
+ {
+ "epoch": 2.427899686520376,
+ "grad_norm": 0.5761134624481201,
+ "learning_rate": 3.606222552766201e-06,
+ "loss": 0.4539,
+ "step": 515
+ },
+ {
+ "epoch": 2.4326018808777428,
+ "grad_norm": 0.4839339256286621,
+ "learning_rate": 3.6002089902129844e-06,
+ "loss": 0.4469,
+ "step": 516
+ },
+ {
+ "epoch": 2.43730407523511,
+ "grad_norm": 0.4765976369380951,
+ "learning_rate": 3.5941875223398225e-06,
+ "loss": 0.4379,
+ "step": 517
+ },
+ {
+ "epoch": 2.4420062695924765,
+ "grad_norm": 0.5239338874816895,
+ "learning_rate": 3.588158192412707e-06,
+ "loss": 0.4354,
+ "step": 518
+ },
+ {
+ "epoch": 2.446708463949843,
+ "grad_norm": 0.48244595527648926,
+ "learning_rate": 3.582121043754116e-06,
+ "loss": 0.438,
+ "step": 519
+ },
+ {
+ "epoch": 2.45141065830721,
+ "grad_norm": 0.4641244411468506,
+ "learning_rate": 3.5760761197427097e-06,
+ "loss": 0.438,
+ "step": 520
+ },
+ {
+ "epoch": 2.456112852664577,
+ "grad_norm": 0.48468074202537537,
+ "learning_rate": 3.570023463813017e-06,
+ "loss": 0.4306,
+ "step": 521
+ },
+ {
+ "epoch": 2.4608150470219434,
+ "grad_norm": 0.48626402020454407,
+ "learning_rate": 3.5639631194551216e-06,
+ "loss": 0.4531,
+ "step": 522
+ },
+ {
+ "epoch": 2.4655172413793105,
+ "grad_norm": 0.5581764578819275,
+ "learning_rate": 3.557895130214352e-06,
+ "loss": 0.4451,
+ "step": 523
+ },
+ {
+ "epoch": 2.470219435736677,
+ "grad_norm": 0.6739279627799988,
+ "learning_rate": 3.5518195396909653e-06,
+ "loss": 0.4636,
+ "step": 524
+ },
+ {
+ "epoch": 2.4749216300940438,
+ "grad_norm": 0.550710916519165,
+ "learning_rate": 3.5457363915398384e-06,
+ "loss": 0.4513,
+ "step": 525
+ },
+ {
+ "epoch": 2.479623824451411,
+ "grad_norm": 0.479632705450058,
+ "learning_rate": 3.539645729470151e-06,
+ "loss": 0.4387,
+ "step": 526
+ },
+ {
+ "epoch": 2.4843260188087775,
+ "grad_norm": 0.48741331696510315,
+ "learning_rate": 3.5335475972450715e-06,
+ "loss": 0.4388,
+ "step": 527
+ },
+ {
+ "epoch": 2.489028213166144,
+ "grad_norm": 0.4964964985847473,
+ "learning_rate": 3.5274420386814458e-06,
+ "loss": 0.4643,
+ "step": 528
+ },
+ {
+ "epoch": 2.493730407523511,
+ "grad_norm": 0.5134934186935425,
+ "learning_rate": 3.521329097649478e-06,
+ "loss": 0.4454,
+ "step": 529
+ },
+ {
+ "epoch": 2.498432601880878,
+ "grad_norm": 0.4962058961391449,
+ "learning_rate": 3.515208818072418e-06,
+ "loss": 0.4408,
+ "step": 530
+ },
+ {
+ "epoch": 2.5031347962382444,
+ "grad_norm": 0.5611489415168762,
+ "learning_rate": 3.509081243926247e-06,
+ "loss": 0.4306,
+ "step": 531
+ },
+ {
+ "epoch": 2.507836990595611,
+ "grad_norm": 0.7012472748756409,
+ "learning_rate": 3.5029464192393557e-06,
+ "loss": 0.4614,
+ "step": 532
+ },
+ {
+ "epoch": 2.512539184952978,
+ "grad_norm": 0.5351004004478455,
+ "learning_rate": 3.4968043880922363e-06,
+ "loss": 0.4151,
+ "step": 533
+ },
+ {
+ "epoch": 2.5172413793103448,
+ "grad_norm": 0.5087808966636658,
+ "learning_rate": 3.4906551946171603e-06,
+ "loss": 0.4242,
+ "step": 534
+ },
+ {
+ "epoch": 2.521943573667712,
+ "grad_norm": 0.5459093451499939,
+ "learning_rate": 3.484498882997861e-06,
+ "loss": 0.4215,
+ "step": 535
+ },
+ {
+ "epoch": 2.5266457680250785,
+ "grad_norm": 0.49804285168647766,
+ "learning_rate": 3.478335497469219e-06,
+ "loss": 0.4492,
+ "step": 536
+ },
+ {
+ "epoch": 2.531347962382445,
+ "grad_norm": 0.4959704875946045,
+ "learning_rate": 3.472165082316943e-06,
+ "loss": 0.4511,
+ "step": 537
+ },
+ {
+ "epoch": 2.5360501567398117,
+ "grad_norm": 0.5059382319450378,
+ "learning_rate": 3.465987681877251e-06,
+ "loss": 0.4419,
+ "step": 538
+ },
+ {
+ "epoch": 2.540752351097179,
+ "grad_norm": 0.7398380637168884,
+ "learning_rate": 3.4598033405365527e-06,
+ "loss": 0.4548,
+ "step": 539
+ },
+ {
+ "epoch": 2.5454545454545454,
+ "grad_norm": 0.5326687693595886,
+ "learning_rate": 3.45361210273113e-06,
+ "loss": 0.4473,
+ "step": 540
+ },
+ {
+ "epoch": 2.5501567398119125,
+ "grad_norm": 0.5069761872291565,
+ "learning_rate": 3.447414012946818e-06,
+ "loss": 0.4343,
+ "step": 541
+ },
+ {
+ "epoch": 2.554858934169279,
+ "grad_norm": 0.45915964245796204,
+ "learning_rate": 3.4412091157186853e-06,
+ "loss": 0.4499,
+ "step": 542
+ },
+ {
+ "epoch": 2.5595611285266457,
+ "grad_norm": 0.5174360275268555,
+ "learning_rate": 3.4349974556307146e-06,
+ "loss": 0.44,
+ "step": 543
+ },
+ {
+ "epoch": 2.5642633228840124,
+ "grad_norm": 0.5008105039596558,
+ "learning_rate": 3.4287790773154807e-06,
+ "loss": 0.4648,
+ "step": 544
+ },
+ {
+ "epoch": 2.5689655172413794,
+ "grad_norm": 0.5628801584243774,
+ "learning_rate": 3.4225540254538297e-06,
+ "loss": 0.462,
+ "step": 545
+ },
+ {
+ "epoch": 2.573667711598746,
+ "grad_norm": 0.9913654923439026,
+ "learning_rate": 3.416322344774562e-06,
+ "loss": 0.4403,
+ "step": 546
+ },
+ {
+ "epoch": 2.5783699059561127,
+ "grad_norm": 0.5034172534942627,
+ "learning_rate": 3.4100840800541055e-06,
+ "loss": 0.4622,
+ "step": 547
+ },
+ {
+ "epoch": 2.58307210031348,
+ "grad_norm": 0.495516836643219,
+ "learning_rate": 3.4038392761161986e-06,
+ "loss": 0.4523,
+ "step": 548
+ },
+ {
+ "epoch": 2.5877742946708464,
+ "grad_norm": 0.48142367601394653,
+ "learning_rate": 3.3975879778315634e-06,
+ "loss": 0.4242,
+ "step": 549
+ },
+ {
+ "epoch": 2.592476489028213,
+ "grad_norm": 0.4635900557041168,
+ "learning_rate": 3.391330230117587e-06,
+ "loss": 0.3949,
+ "step": 550
+ },
+ {
+ "epoch": 2.5971786833855797,
+ "grad_norm": 0.4769044816493988,
+ "learning_rate": 3.385066077937997e-06,
+ "loss": 0.4651,
+ "step": 551
+ },
+ {
+ "epoch": 2.6018808777429467,
+ "grad_norm": 1.059553861618042,
+ "learning_rate": 3.378795566302541e-06,
+ "loss": 0.4243,
+ "step": 552
+ },
+ {
+ "epoch": 2.6065830721003134,
+ "grad_norm": 0.512134850025177,
+ "learning_rate": 3.372518740266658e-06,
+ "loss": 0.4435,
+ "step": 553
+ },
+ {
+ "epoch": 2.6112852664576804,
+ "grad_norm": 0.5267173647880554,
+ "learning_rate": 3.36623564493116e-06,
+ "loss": 0.4558,
+ "step": 554
+ },
+ {
+ "epoch": 2.615987460815047,
+ "grad_norm": 0.49343907833099365,
+ "learning_rate": 3.3599463254419047e-06,
+ "loss": 0.4598,
+ "step": 555
+ },
+ {
+ "epoch": 2.6206896551724137,
+ "grad_norm": 0.5496839284896851,
+ "learning_rate": 3.3536508269894724e-06,
+ "loss": 0.4669,
+ "step": 556
+ },
+ {
+ "epoch": 2.6253918495297803,
+ "grad_norm": 0.5957831740379333,
+ "learning_rate": 3.347349194808842e-06,
+ "loss": 0.4533,
+ "step": 557
+ },
+ {
+ "epoch": 2.6300940438871474,
+ "grad_norm": 0.5049230456352234,
+ "learning_rate": 3.3410414741790625e-06,
+ "loss": 0.4293,
+ "step": 558
+ },
+ {
+ "epoch": 2.634796238244514,
+ "grad_norm": 0.5167728066444397,
+ "learning_rate": 3.3347277104229332e-06,
+ "loss": 0.443,
+ "step": 559
+ },
+ {
+ "epoch": 2.639498432601881,
+ "grad_norm": 0.6090758442878723,
+ "learning_rate": 3.3284079489066728e-06,
+ "loss": 0.4378,
+ "step": 560
+ },
+ {
+ "epoch": 2.6442006269592477,
+ "grad_norm": 0.5165027379989624,
+ "learning_rate": 3.3220822350395966e-06,
+ "loss": 0.4302,
+ "step": 561
+ },
+ {
+ "epoch": 2.6489028213166144,
+ "grad_norm": 0.5152680277824402,
+ "learning_rate": 3.31575061427379e-06,
+ "loss": 0.4311,
+ "step": 562
+ },
+ {
+ "epoch": 2.653605015673981,
+ "grad_norm": 0.547235906124115,
+ "learning_rate": 3.3094131321037783e-06,
+ "loss": 0.4371,
+ "step": 563
+ },
+ {
+ "epoch": 2.658307210031348,
+ "grad_norm": 0.521981418132782,
+ "learning_rate": 3.303069834066206e-06,
+ "loss": 0.4346,
+ "step": 564
+ },
+ {
+ "epoch": 2.6630094043887147,
+ "grad_norm": 0.5127217769622803,
+ "learning_rate": 3.2967207657395055e-06,
+ "loss": 0.474,
+ "step": 565
+ },
+ {
+ "epoch": 2.6677115987460818,
+ "grad_norm": 0.5210872888565063,
+ "learning_rate": 3.2903659727435692e-06,
+ "loss": 0.4622,
+ "step": 566
+ },
+ {
+ "epoch": 2.6724137931034484,
+ "grad_norm": 0.5768873691558838,
+ "learning_rate": 3.284005500739423e-06,
+ "loss": 0.4556,
+ "step": 567
+ },
+ {
+ "epoch": 2.677115987460815,
+ "grad_norm": 0.5305764675140381,
+ "learning_rate": 3.2776393954289e-06,
+ "loss": 0.429,
+ "step": 568
+ },
+ {
+ "epoch": 2.6818181818181817,
+ "grad_norm": 0.5312129855155945,
+ "learning_rate": 3.271267702554307e-06,
+ "loss": 0.4208,
+ "step": 569
+ },
+ {
+ "epoch": 2.6865203761755487,
+ "grad_norm": 0.5433884859085083,
+ "learning_rate": 3.2648904678981032e-06,
+ "loss": 0.4647,
+ "step": 570
+ },
+ {
+ "epoch": 2.6912225705329154,
+ "grad_norm": 1.2331725358963013,
+ "learning_rate": 3.2585077372825636e-06,
+ "loss": 0.4126,
+ "step": 571
+ },
+ {
+ "epoch": 2.695924764890282,
+ "grad_norm": 0.5495198369026184,
+ "learning_rate": 3.2521195565694543e-06,
+ "loss": 0.4453,
+ "step": 572
+ },
+ {
+ "epoch": 2.700626959247649,
+ "grad_norm": 0.5230907201766968,
+ "learning_rate": 3.2457259716597023e-06,
+ "loss": 0.446,
+ "step": 573
+ },
+ {
+ "epoch": 2.7053291536050157,
+ "grad_norm": 0.4807503819465637,
+ "learning_rate": 3.2393270284930658e-06,
+ "loss": 0.4547,
+ "step": 574
+ },
+ {
+ "epoch": 2.7100313479623823,
+ "grad_norm": 0.5169614553451538,
+ "learning_rate": 3.2329227730478026e-06,
+ "loss": 0.4319,
+ "step": 575
+ },
+ {
+ "epoch": 2.714733542319749,
+ "grad_norm": 0.502966046333313,
+ "learning_rate": 3.2265132513403415e-06,
+ "loss": 0.4196,
+ "step": 576
+ },
+ {
+ "epoch": 2.719435736677116,
+ "grad_norm": 0.5387672781944275,
+ "learning_rate": 3.22009850942495e-06,
+ "loss": 0.4449,
+ "step": 577
+ },
+ {
+ "epoch": 2.7241379310344827,
+ "grad_norm": 0.5503709316253662,
+ "learning_rate": 3.213678593393405e-06,
+ "loss": 0.4589,
+ "step": 578
+ },
+ {
+ "epoch": 2.7288401253918497,
+ "grad_norm": 0.5165039300918579,
+ "learning_rate": 3.207253549374662e-06,
+ "loss": 0.4578,
+ "step": 579
+ },
+ {
+ "epoch": 2.7335423197492164,
+ "grad_norm": 0.5894023180007935,
+ "learning_rate": 3.200823423534519e-06,
+ "loss": 0.4448,
+ "step": 580
+ },
+ {
+ "epoch": 2.738244514106583,
+ "grad_norm": 0.5234156250953674,
+ "learning_rate": 3.194388262075293e-06,
+ "loss": 0.4504,
+ "step": 581
+ },
+ {
+ "epoch": 2.7429467084639496,
+ "grad_norm": 0.47498077154159546,
+ "learning_rate": 3.1879481112354804e-06,
+ "loss": 0.4471,
+ "step": 582
+ },
+ {
+ "epoch": 2.7476489028213167,
+ "grad_norm": 0.5213322043418884,
+ "learning_rate": 3.181503017289428e-06,
+ "loss": 0.4096,
+ "step": 583
+ },
+ {
+ "epoch": 2.7523510971786833,
+ "grad_norm": 0.5031464695930481,
+ "learning_rate": 3.175053026547002e-06,
+ "loss": 0.416,
+ "step": 584
+ },
+ {
+ "epoch": 2.7570532915360504,
+ "grad_norm": 0.7983574867248535,
+ "learning_rate": 3.16859818535325e-06,
+ "loss": 0.457,
+ "step": 585
+ },
+ {
+ "epoch": 2.761755485893417,
+ "grad_norm": 0.47774994373321533,
+ "learning_rate": 3.1621385400880756e-06,
+ "loss": 0.4529,
+ "step": 586
+ },
+ {
+ "epoch": 2.7664576802507836,
+ "grad_norm": 0.8216882348060608,
+ "learning_rate": 3.1556741371658984e-06,
+ "loss": 0.4559,
+ "step": 587
+ },
+ {
+ "epoch": 2.7711598746081503,
+ "grad_norm": 0.5124049186706543,
+ "learning_rate": 3.1492050230353238e-06,
+ "loss": 0.4438,
+ "step": 588
+ },
+ {
+ "epoch": 2.7758620689655173,
+ "grad_norm": 0.5410915017127991,
+ "learning_rate": 3.142731244178809e-06,
+ "loss": 0.4195,
+ "step": 589
+ },
+ {
+ "epoch": 2.780564263322884,
+ "grad_norm": 0.5318175554275513,
+ "learning_rate": 3.1362528471123277e-06,
+ "loss": 0.4046,
+ "step": 590
+ },
+ {
+ "epoch": 2.785266457680251,
+ "grad_norm": 0.6133676171302795,
+ "learning_rate": 3.129769878385039e-06,
+ "loss": 0.4098,
+ "step": 591
+ },
+ {
+ "epoch": 2.7899686520376177,
+ "grad_norm": 0.4698888063430786,
+ "learning_rate": 3.1232823845789473e-06,
+ "loss": 0.4508,
+ "step": 592
+ },
+ {
+ "epoch": 2.7946708463949843,
+ "grad_norm": 0.6980767250061035,
+ "learning_rate": 3.1167904123085736e-06,
+ "loss": 0.455,
+ "step": 593
+ },
+ {
+ "epoch": 2.799373040752351,
+ "grad_norm": 0.5151284337043762,
+ "learning_rate": 3.110294008220617e-06,
+ "loss": 0.4431,
+ "step": 594
+ },
+ {
+ "epoch": 2.804075235109718,
+ "grad_norm": 0.47901320457458496,
+ "learning_rate": 3.1037932189936205e-06,
+ "loss": 0.4406,
+ "step": 595
+ },
+ {
+ "epoch": 2.8087774294670846,
+ "grad_norm": 0.5079891085624695,
+ "learning_rate": 3.097288091337635e-06,
+ "loss": 0.4351,
+ "step": 596
+ },
+ {
+ "epoch": 2.8134796238244513,
+ "grad_norm": 0.5278874635696411,
+ "learning_rate": 3.0907786719938876e-06,
+ "loss": 0.4264,
+ "step": 597
+ },
+ {
+ "epoch": 2.8181818181818183,
+ "grad_norm": 0.47123396396636963,
+ "learning_rate": 3.084265007734436e-06,
+ "loss": 0.434,
+ "step": 598
+ },
+ {
+ "epoch": 2.822884012539185,
+ "grad_norm": 0.5229635834693909,
+ "learning_rate": 3.0777471453618457e-06,
+ "loss": 0.4602,
+ "step": 599
+ },
+ {
+ "epoch": 2.8275862068965516,
+ "grad_norm": 0.47847074270248413,
+ "learning_rate": 3.0712251317088426e-06,
+ "loss": 0.4317,
+ "step": 600
+ },
+ {
+ "epoch": 2.8322884012539182,
+ "grad_norm": 0.7754543423652649,
+ "learning_rate": 3.064699013637983e-06,
+ "loss": 0.4528,
+ "step": 601
+ },
+ {
+ "epoch": 2.8369905956112853,
+ "grad_norm": 0.5581084489822388,
+ "learning_rate": 3.0581688380413115e-06,
+ "loss": 0.4369,
+ "step": 602
+ },
+ {
+ "epoch": 2.841692789968652,
+ "grad_norm": 0.588622510433197,
+ "learning_rate": 3.0516346518400315e-06,
+ "loss": 0.4517,
+ "step": 603
+ },
+ {
+ "epoch": 2.846394984326019,
+ "grad_norm": 0.565423846244812,
+ "learning_rate": 3.0450965019841593e-06,
+ "loss": 0.4517,
+ "step": 604
+ },
+ {
+ "epoch": 2.8510971786833856,
+ "grad_norm": 0.47801777720451355,
+ "learning_rate": 3.0385544354521957e-06,
+ "loss": 0.4161,
+ "step": 605
+ },
+ {
+ "epoch": 2.8557993730407523,
+ "grad_norm": 0.5034862756729126,
+ "learning_rate": 3.0320084992507814e-06,
+ "loss": 0.4428,
+ "step": 606
+ },
+ {
+ "epoch": 2.860501567398119,
+ "grad_norm": 0.5339663624763489,
+ "learning_rate": 3.0254587404143604e-06,
+ "loss": 0.4792,
+ "step": 607
+ },
+ {
+ "epoch": 2.865203761755486,
+ "grad_norm": 0.48184943199157715,
+ "learning_rate": 3.0189052060048464e-06,
+ "loss": 0.4409,
+ "step": 608
+ },
+ {
+ "epoch": 2.8699059561128526,
+ "grad_norm": 0.5102176070213318,
+ "learning_rate": 3.01234794311128e-06,
+ "loss": 0.438,
+ "step": 609
+ },
+ {
+ "epoch": 2.8746081504702197,
+ "grad_norm": 0.5111781358718872,
+ "learning_rate": 3.0057869988494925e-06,
+ "loss": 0.4617,
+ "step": 610
+ },
+ {
+ "epoch": 2.8793103448275863,
+ "grad_norm": 0.5915101766586304,
+ "learning_rate": 2.999222420361767e-06,
+ "loss": 0.4532,
+ "step": 611
+ },
+ {
+ "epoch": 2.884012539184953,
+ "grad_norm": 0.48898932337760925,
+ "learning_rate": 2.9926542548165e-06,
+ "loss": 0.4663,
+ "step": 612
+ },
+ {
+ "epoch": 2.8887147335423196,
+ "grad_norm": 0.4943861961364746,
+ "learning_rate": 2.9860825494078605e-06,
+ "loss": 0.4354,
+ "step": 613
+ },
+ {
+ "epoch": 2.8934169278996866,
+ "grad_norm": 0.5398025512695312,
+ "learning_rate": 2.979507351355454e-06,
+ "loss": 0.4546,
+ "step": 614
+ },
+ {
+ "epoch": 2.8981191222570533,
+ "grad_norm": 0.545421302318573,
+ "learning_rate": 2.972928707903981e-06,
+ "loss": 0.4404,
+ "step": 615
+ },
+ {
+ "epoch": 2.9028213166144203,
+ "grad_norm": 0.5370550751686096,
+ "learning_rate": 2.966346666322898e-06,
+ "loss": 0.4401,
+ "step": 616
+ },
+ {
+ "epoch": 2.907523510971787,
+ "grad_norm": 0.5280672311782837,
+ "learning_rate": 2.9597612739060775e-06,
+ "loss": 0.4172,
+ "step": 617
+ },
+ {
+ "epoch": 2.9122257053291536,
+ "grad_norm": 0.5043423175811768,
+ "learning_rate": 2.9531725779714713e-06,
+ "loss": 0.4487,
+ "step": 618
+ },
+ {
+ "epoch": 2.91692789968652,
+ "grad_norm": 1.961200475692749,
+ "learning_rate": 2.9465806258607653e-06,
+ "loss": 0.4548,
+ "step": 619
+ },
+ {
+ "epoch": 2.9216300940438873,
+ "grad_norm": 0.5286726355552673,
+ "learning_rate": 2.939985464939043e-06,
+ "loss": 0.4566,
+ "step": 620
+ },
+ {
+ "epoch": 2.926332288401254,
+ "grad_norm": 0.5209453105926514,
+ "learning_rate": 2.9333871425944434e-06,
+ "loss": 0.4064,
+ "step": 621
+ },
+ {
+ "epoch": 2.9310344827586206,
+ "grad_norm": 0.47711747884750366,
+ "learning_rate": 2.926785706237822e-06,
+ "loss": 0.4341,
+ "step": 622
+ },
+ {
+ "epoch": 2.9357366771159876,
+ "grad_norm": 0.45926427841186523,
+ "learning_rate": 2.920181203302409e-06,
+ "loss": 0.4256,
+ "step": 623
+ },
+ {
+ "epoch": 2.9404388714733543,
+ "grad_norm": 0.5624600648880005,
+ "learning_rate": 2.91357368124347e-06,
+ "loss": 0.4252,
+ "step": 624
+ },
+ {
+ "epoch": 2.945141065830721,
+ "grad_norm": 0.5101850628852844,
+ "learning_rate": 2.906963187537962e-06,
+ "loss": 0.4352,
+ "step": 625
+ },
+ {
+ "epoch": 2.9498432601880875,
+ "grad_norm": 0.5341358184814453,
+ "learning_rate": 2.9003497696841955e-06,
+ "loss": 0.4132,
+ "step": 626
+ },
+ {
+ "epoch": 2.9545454545454546,
+ "grad_norm": 0.5917084217071533,
+ "learning_rate": 2.8937334752014913e-06,
+ "loss": 0.4693,
+ "step": 627
+ },
+ {
+ "epoch": 2.959247648902821,
+ "grad_norm": 0.793695330619812,
+ "learning_rate": 2.887114351629839e-06,
+ "loss": 0.4431,
+ "step": 628
+ },
+ {
+ "epoch": 2.9639498432601883,
+ "grad_norm": 0.5363728404045105,
+ "learning_rate": 2.8804924465295575e-06,
+ "loss": 0.4672,
+ "step": 629
+ },
+ {
+ "epoch": 2.968652037617555,
+ "grad_norm": 0.4979572892189026,
+ "learning_rate": 2.873867807480951e-06,
+ "loss": 0.4723,
+ "step": 630
+ },
+ {
+ "epoch": 2.9733542319749215,
+ "grad_norm": 0.5310130715370178,
+ "learning_rate": 2.8672404820839676e-06,
+ "loss": 0.4388,
+ "step": 631
+ },
+ {
+ "epoch": 2.978056426332288,
+ "grad_norm": 0.530015766620636,
+ "learning_rate": 2.8606105179578584e-06,
+ "loss": 0.4466,
+ "step": 632
+ },
+ {
+ "epoch": 2.9827586206896552,
+ "grad_norm": 0.5356627702713013,
+ "learning_rate": 2.8539779627408332e-06,
+ "loss": 0.4252,
+ "step": 633
+ },
+ {
+ "epoch": 2.987460815047022,
+ "grad_norm": 0.5290245413780212,
+ "learning_rate": 2.847342864089721e-06,
+ "loss": 0.4453,
+ "step": 634
+ },
+ {
+ "epoch": 2.992163009404389,
+ "grad_norm": 0.471682071685791,
+ "learning_rate": 2.8407052696796255e-06,
+ "loss": 0.43,
+ "step": 635
+ },
+ {
+ "epoch": 2.9968652037617556,
+ "grad_norm": 0.5220829844474792,
+ "learning_rate": 2.834065227203584e-06,
+ "loss": 0.4494,
+ "step": 636
+ }
+ ],
+ "logging_steps": 1,
+ "max_steps": 1272,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 6,
+ "save_steps": 212,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 4.095793346762716e+19,
+ "train_batch_size": 4,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/checkpoint-636/training_args.bin b/checkpoint-636/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7db90ca60ea3c300feb3b7d6e0cb54fc7cfb2060
--- /dev/null
+++ b/checkpoint-636/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:51f85402b182fc4b86518e0cb9ca9cbf150300e36000a38f53507b9a8663ad4b
+size 7928
diff --git a/checkpoint-636/zero_to_fp32.py b/checkpoint-636/zero_to_fp32.py
new file mode 100644
index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8
--- /dev/null
+++ b/checkpoint-636/zero_to_fp32.py
@@ -0,0 +1,604 @@
+#!/usr/bin/env python
+
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example: python zero_to_fp32.py . pytorch_model.bin
+
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+from collections import OrderedDict
+from dataclasses import dataclass
+
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+ FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+ FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+
+
+@dataclass
+class zero_model_state:
+ buffers: dict()
+ param_shapes: dict()
+ shared_params: list
+ ds_version: int
+ frozen_param_shapes: dict()
+ frozen_param_fragments: dict()
+
+
+debug = 0
+
+# load to cpu
+device = torch.device('cpu')
+
+
+def atoi(text):
+ return int(text) if text.isdigit() else text
+
+
+def natural_keys(text):
+ '''
+ alist.sort(key=natural_keys) sorts in human order
+ http://nedbatchelder.com/blog/200712/human_sorting.html
+ (See Toothy's implementation in the comments)
+ '''
+ return [atoi(c) for c in re.split(r'(\d+)', text)]
+
+
+def get_model_state_file(checkpoint_dir, zero_stage):
+ if not os.path.isdir(checkpoint_dir):
+ raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+
+ # there should be only one file
+ if zero_stage <= 2:
+ file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+ elif zero_stage == 3:
+ file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+
+ if not os.path.exists(file):
+ raise FileNotFoundError(f"can't find model states file at '{file}'")
+
+ return file
+
+
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+ # XXX: need to test that this simple glob rule works for multi-node setup too
+ ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
+
+ if len(ckpt_files) == 0:
+ raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+
+ return ckpt_files
+
+
+def get_optim_files(checkpoint_dir):
+ return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
+
+
+def get_model_state_files(checkpoint_dir):
+ return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
+
+
+def parse_model_states(files):
+ zero_model_states = []
+ for file in files:
+ state_dict = torch.load(file, map_location=device)
+
+ if BUFFER_NAMES not in state_dict:
+ raise ValueError(f"{file} is not a model state checkpoint")
+ buffer_names = state_dict[BUFFER_NAMES]
+ if debug:
+ print("Found buffers:", buffer_names)
+
+ # recover just the buffers while restoring them to fp32 if they were saved in fp16
+ buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+ param_shapes = state_dict[PARAM_SHAPES]
+
+ # collect parameters that are included in param_shapes
+ param_names = []
+ for s in param_shapes:
+ for name in s.keys():
+ param_names.append(name)
+
+ # update with frozen parameters
+ frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+ if frozen_param_shapes is not None:
+ if debug:
+ print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+ param_names += list(frozen_param_shapes.keys())
+
+ # handle shared params
+ shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+
+ ds_version = state_dict.get(DS_VERSION, None)
+
+ frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+
+ z_model_state = zero_model_state(buffers=buffers,
+ param_shapes=param_shapes,
+ shared_params=shared_params,
+ ds_version=ds_version,
+ frozen_param_shapes=frozen_param_shapes,
+ frozen_param_fragments=frozen_param_fragments)
+ zero_model_states.append(z_model_state)
+
+ return zero_model_states
+
+
+def parse_optim_states(files, ds_checkpoint_dir):
+
+ total_files = len(files)
+ state_dicts = []
+ for f in files:
+ state_dict = torch.load(f, map_location=device)
+ # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
+ # and also handle the case where it was already removed by another helper script
+ state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
+ state_dicts.append(state_dict)
+
+ if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]:
+ raise ValueError(f"{files[0]} is not a zero checkpoint")
+ zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
+ world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
+
+ # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+ # parameters can be different from data parallelism for non-expert parameters. So we can just
+ # use the max of the partition_count to get the dp world_size.
+
+ if type(world_size) is list:
+ world_size = max(world_size)
+
+ if world_size != total_files:
+ raise ValueError(
+ f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+ "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+ )
+
+ # the groups are named differently in each stage
+ if zero_stage <= 2:
+ fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+ elif zero_stage == 3:
+ fp32_groups_key = FP32_FLAT_GROUPS
+ else:
+ raise ValueError(f"unknown zero stage {zero_stage}")
+
+ if zero_stage <= 2:
+ fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
+ elif zero_stage == 3:
+ # if there is more than one param group, there will be multiple flattened tensors - one
+ # flattened tensor per group - for simplicity merge them into a single tensor
+ #
+ # XXX: could make the script more memory efficient for when there are multiple groups - it
+ # will require matching the sub-lists of param_shapes for each param group flattened tensor
+
+ fp32_flat_groups = [
+ torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts))
+ ]
+
+ return zero_stage, world_size, fp32_flat_groups
+
+
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+ """
+ Returns fp32 state_dict reconstructed from ds checkpoint
+
+ Args:
+ - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+
+ """
+ print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+
+ optim_files = get_optim_files(ds_checkpoint_dir)
+ zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
+ print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+
+ model_files = get_model_state_files(ds_checkpoint_dir)
+
+ zero_model_states = parse_model_states(model_files)
+ print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+
+ if zero_stage <= 2:
+ return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters)
+ elif zero_stage == 3:
+ return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters)
+
+
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+ if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+ return
+
+ frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+ frozen_param_fragments = zero_model_states[0].frozen_param_fragments
+
+ if debug:
+ num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+ print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+ wanted_params = len(frozen_param_shapes)
+ wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+ avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+ print(f'Frozen params: Have {avail_numel} numels to process.')
+ print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+ total_params = 0
+ total_numel = 0
+ for name, shape in frozen_param_shapes.items():
+ total_params += 1
+ unpartitioned_numel = shape.numel()
+ total_numel += unpartitioned_numel
+
+ state_dict[name] = frozen_param_fragments[name]
+
+ if debug:
+ print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+
+ print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _has_callable(obj, fn):
+ attr = getattr(obj, fn, None)
+ return callable(attr)
+
+
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+ param_shapes = zero_model_states[0].param_shapes
+
+ # Reconstruction protocol:
+ #
+ # XXX: document this
+
+ if debug:
+ for i in range(world_size):
+ for j in range(len(fp32_flat_groups[0])):
+ print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+
+ # XXX: memory usage doubles here (zero2)
+ num_param_groups = len(fp32_flat_groups[0])
+ merged_single_partition_of_fp32_groups = []
+ for i in range(num_param_groups):
+ merged_partitions = [sd[i] for sd in fp32_flat_groups]
+ full_single_fp32_vector = torch.cat(merged_partitions, 0)
+ merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+ avail_numel = sum(
+ [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+
+ if debug:
+ wanted_params = sum([len(shapes) for shapes in param_shapes])
+ wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+ # not asserting if there is a mismatch due to possible padding
+ print(f"Have {avail_numel} numels to process.")
+ print(f"Need {wanted_numel} numels in {wanted_params} params.")
+
+ # params
+ # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+ # out-of-core computing solution
+ total_numel = 0
+ total_params = 0
+ for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+ offset = 0
+ avail_numel = full_single_fp32_vector.numel()
+ for name, shape in shapes.items():
+
+ unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+ total_numel += unpartitioned_numel
+ total_params += 1
+
+ if debug:
+ print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+ state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+ offset += unpartitioned_numel
+
+ # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+ # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+ # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+ # live optimizer object, so we are checking that the numbers are within the right range
+ align_to = 2 * world_size
+
+ def zero2_align(x):
+ return align_to * math.ceil(x / align_to)
+
+ if debug:
+ print(f"original offset={offset}, avail_numel={avail_numel}")
+
+ offset = zero2_align(offset)
+ avail_numel = zero2_align(avail_numel)
+
+ if debug:
+ print(f"aligned offset={offset}, avail_numel={avail_numel}")
+
+ # Sanity check
+ if offset != avail_numel:
+ raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+ print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters):
+ state_dict = OrderedDict()
+
+ # buffers
+ buffers = zero_model_states[0].buffers
+ state_dict.update(buffers)
+ if debug:
+ print(f"added {len(buffers)} buffers")
+
+ if not exclude_frozen_parameters:
+ _zero2_merge_frozen_params(state_dict, zero_model_states)
+
+ _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+ # recover shared parameters
+ for pair in zero_model_states[0].shared_params:
+ if pair[1] in state_dict:
+ state_dict[pair[0]] = state_dict[pair[1]]
+
+ return state_dict
+
+
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+ remainder = unpartitioned_numel % world_size
+ padding_numel = (world_size - remainder) if remainder else 0
+ partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+ return partitioned_numel, padding_numel
+
+
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+ if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+ return
+
+ if debug:
+ for i in range(world_size):
+ num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+ print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+ frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+ wanted_params = len(frozen_param_shapes)
+ wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+ avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+ print(f'Frozen params: Have {avail_numel} numels to process.')
+ print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+ total_params = 0
+ total_numel = 0
+ for name, shape in zero_model_states[0].frozen_param_shapes.items():
+ total_params += 1
+ unpartitioned_numel = shape.numel()
+ total_numel += unpartitioned_numel
+
+ param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+ state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+
+ partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+ if debug:
+ print(
+ f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+ )
+
+ print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+ param_shapes = zero_model_states[0].param_shapes
+ avail_numel = fp32_flat_groups[0].numel() * world_size
+ # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+ # param, re-consolidating each param, while dealing with padding if any
+
+ # merge list of dicts, preserving order
+ param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+
+ if debug:
+ for i in range(world_size):
+ print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+
+ wanted_params = len(param_shapes)
+ wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+ # not asserting if there is a mismatch due to possible padding
+ avail_numel = fp32_flat_groups[0].numel() * world_size
+ print(f"Trainable params: Have {avail_numel} numels to process.")
+ print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+
+ # params
+ # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+ # out-of-core computing solution
+ offset = 0
+ total_numel = 0
+ total_params = 0
+ for name, shape in param_shapes.items():
+
+ unpartitioned_numel = shape.numel()
+ total_numel += unpartitioned_numel
+ total_params += 1
+
+ partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+ if debug:
+ print(
+ f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+ )
+
+ # XXX: memory usage doubles here
+ state_dict[name] = torch.cat(
+ tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)),
+ 0).narrow(0, 0, unpartitioned_numel).view(shape)
+ offset += partitioned_numel
+
+ offset *= world_size
+
+ # Sanity check
+ if offset != avail_numel:
+ raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+ print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters):
+ state_dict = OrderedDict()
+
+ # buffers
+ buffers = zero_model_states[0].buffers
+ state_dict.update(buffers)
+ if debug:
+ print(f"added {len(buffers)} buffers")
+
+ if not exclude_frozen_parameters:
+ _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+
+ _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+ # recover shared parameters
+ for pair in zero_model_states[0].shared_params:
+ if pair[1] in state_dict:
+ state_dict[pair[0]] = state_dict[pair[1]]
+
+ return state_dict
+
+
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False):
+ """
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+ ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+ via a model hub.
+
+ Args:
+ - ``checkpoint_dir``: path to the desired checkpoint folder
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+ - ``exclude_frozen_parameters``: exclude frozen parameters
+
+ Returns:
+ - pytorch ``state_dict``
+
+ Note: this approach may not work if your application doesn't have sufficient free CPU memory and
+ you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+ the checkpoint.
+
+ A typical usage might be ::
+
+ from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+ # do the training and checkpoint saving
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+ model = model.cpu() # move to cpu
+ model.load_state_dict(state_dict)
+ # submit to model hub or save the model to share with others
+
+ In this example the ``model`` will no longer be usable in the deepspeed context of the same
+ application. i.e. you will need to re-initialize the deepspeed engine, since
+ ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+ If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+
+ """
+ if tag is None:
+ latest_path = os.path.join(checkpoint_dir, 'latest')
+ if os.path.isfile(latest_path):
+ with open(latest_path, 'r') as fd:
+ tag = fd.read().strip()
+ else:
+ raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+
+ ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+
+ if not os.path.isdir(ds_checkpoint_dir):
+ raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+
+ return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+
+
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False):
+ """
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
+ loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+
+ Args:
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+ - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+ - ``exclude_frozen_parameters``: exclude frozen parameters
+ """
+
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters)
+ print(f"Saving fp32 state dict to {output_file}")
+ torch.save(state_dict, output_file)
+
+
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+ """
+ 1. Put the provided model to cpu
+ 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+ 3. Load it into the provided model
+
+ Args:
+ - ``model``: the model object to update
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+
+ Returns:
+ - ``model`: modified model
+
+ Make sure you have plenty of CPU memory available before you call this function. If you don't
+ have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+ conveniently placed for you in the checkpoint folder.
+
+ A typical usage might be ::
+
+ from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+ model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+ # submit to model hub or save the model to share with others
+
+ Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+ of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+ ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+ """
+ logger.info(f"Extracting fp32 weights")
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+
+ logger.info(f"Overwriting model with fp32 weights")
+ model = model.cpu()
+ model.load_state_dict(state_dict, strict=False)
+
+ return model
+
+
+if __name__ == "__main__":
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument("checkpoint_dir",
+ type=str,
+ help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+ parser.add_argument(
+ "output_file",
+ type=str,
+ help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)")
+ parser.add_argument("-t",
+ "--tag",
+ type=str,
+ default=None,
+ help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+ parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+ parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+ args = parser.parse_args()
+
+ debug = args.debug
+
+ convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+ args.output_file,
+ tag=args.tag,
+ exclude_frozen_parameters=args.exclude_frozen_parameters)
diff --git a/checkpoint-848/README.md b/checkpoint-848/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1b184114a0c28ed3e4c082c18486736dc818166d
--- /dev/null
+++ b/checkpoint-848/README.md
@@ -0,0 +1,202 @@
+---
+base_model: meta-llama/Llama-3.3-70B-Instruct
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.15.0
\ No newline at end of file
diff --git a/checkpoint-848/adapter_config.json b/checkpoint-848/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..dc930e1be2d901773c96d6e6d186c72676cbf328
--- /dev/null
+++ b/checkpoint-848/adapter_config.json
@@ -0,0 +1,42 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "meta-llama/Llama-3.3-70B-Instruct",
+ "bias": "none",
+ "corda_config": null,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": null,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 512,
+ "lora_bias": false,
+ "lora_dropout": 0.05,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": [
+ "embed_tokens",
+ "lm_head"
+ ],
+ "peft_type": "LORA",
+ "r": 256,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "up_proj",
+ "gate_proj",
+ "o_proj",
+ "v_proj",
+ "q_proj",
+ "k_proj",
+ "down_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-848/adapter_model.safetensors b/checkpoint-848/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..4651e9718682582fcfbf809fe56d6da04ac1e94b
--- /dev/null
+++ b/checkpoint-848/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d460fd62ed6ee946c17cc4c192d3f03cdccf058ccaf4cc6d7e3ea899b6ef97af
+size 10829849744
diff --git a/checkpoint-848/global_step849/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-848/global_step849/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..a70d0783ee925473b68f96f0107eafd68750957e
--- /dev/null
+++ b/checkpoint-848/global_step849/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2e7b0a6ba6d17fcbcbc09e5f681956f469dcf6324e86a10c1ddd844f928c039c
+size 21659418140
diff --git a/checkpoint-848/global_step849/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-848/global_step849/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..e5454202c8ff148fe5813726d462f15e7bd7eab2
--- /dev/null
+++ b/checkpoint-848/global_step849/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8b078434193a32b33b356c683f633460a4794f9e900192159d5d2a760e4d6c68
+size 21659457372
diff --git a/checkpoint-848/global_step849/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-848/global_step849/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..ef03734fe4823abffc1e91a9f59e945385624a59
--- /dev/null
+++ b/checkpoint-848/global_step849/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:78079ad7baa5b47986205424129989128a1237a9d6d57f308ea14e11434ecfff
+size 21659417820
diff --git a/checkpoint-848/global_step849/mp_rank_00_model_states.pt b/checkpoint-848/global_step849/mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..73a8ded003e08e36e28ffa972ddb457b0e360721
--- /dev/null
+++ b/checkpoint-848/global_step849/mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:38ba4d6e5ca78a4631881e1ea8fd32301aa7928cfc8a96b779194244ebf484e0
+size 11918643933
diff --git a/checkpoint-848/latest b/checkpoint-848/latest
new file mode 100644
index 0000000000000000000000000000000000000000..5be408e7e1edf263132dab2dcb4283627d9a5ab8
--- /dev/null
+++ b/checkpoint-848/latest
@@ -0,0 +1 @@
+global_step849
\ No newline at end of file
diff --git a/checkpoint-848/rng_state_0.pth b/checkpoint-848/rng_state_0.pth
new file mode 100644
index 0000000000000000000000000000000000000000..3223be42d8dc08fb0df65ec23bee5e1d264a5623
--- /dev/null
+++ b/checkpoint-848/rng_state_0.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:df39c7f021bb5054d83a4ab12113028c23a23409fd30e5b70d38614536dc6a7c
+size 14768
diff --git a/checkpoint-848/rng_state_1.pth b/checkpoint-848/rng_state_1.pth
new file mode 100644
index 0000000000000000000000000000000000000000..9d30bff68c659a338a99d5eed89f24fee997bfef
--- /dev/null
+++ b/checkpoint-848/rng_state_1.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:87b29e89c63e047c998f5185fe7f5c732b624f912c71bae76c50284f012a0003
+size 14768
diff --git a/checkpoint-848/rng_state_2.pth b/checkpoint-848/rng_state_2.pth
new file mode 100644
index 0000000000000000000000000000000000000000..fa9c4275656df63aace61a76e3634ade09a901c8
--- /dev/null
+++ b/checkpoint-848/rng_state_2.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:93924fe79395fbe4eb16b4c0dffef9800128319cdd97e6e528d74c478da5f2d7
+size 14768
diff --git a/checkpoint-848/scheduler.pt b/checkpoint-848/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..447bada1383b2336cfc1f6b99fdf0f100000228b
--- /dev/null
+++ b/checkpoint-848/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:63f2db0c51820d5f2040718258c9cb2878f6c2b841f42bafb42c1851491d8aa0
+size 1064
diff --git a/checkpoint-848/special_tokens_map.json b/checkpoint-848/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..278b7f0f84be865c4687700ee7b3c63d89a51e18
--- /dev/null
+++ b/checkpoint-848/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+ "bos_token": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "<|eot_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/checkpoint-848/tokenizer.json b/checkpoint-848/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2
--- /dev/null
+++ b/checkpoint-848/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
+size 17209920
diff --git a/checkpoint-848/tokenizer_config.json b/checkpoint-848/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ca91a2ef55f4239a7af81d7c9abb05f53621a07b
--- /dev/null
+++ b/checkpoint-848/tokenizer_config.json
@@ -0,0 +1,2064 @@
+{
+ "added_tokens_decoder": {
+ "128000": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128001": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128002": {
+ "content": "<|reserved_special_token_0|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128003": {
+ "content": "<|reserved_special_token_1|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128004": {
+ "content": "<|finetune_right_pad_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128005": {
+ "content": "<|reserved_special_token_2|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128006": {
+ "content": "<|start_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128007": {
+ "content": "<|end_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128008": {
+ "content": "<|eom_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128009": {
+ "content": "<|eot_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128010": {
+ "content": "<|python_tag|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128011": {
+ "content": "<|reserved_special_token_3|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128012": {
+ "content": "<|reserved_special_token_4|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128013": {
+ "content": "<|reserved_special_token_5|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128014": {
+ "content": "<|reserved_special_token_6|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128015": {
+ "content": "<|reserved_special_token_7|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128016": {
+ "content": "<|reserved_special_token_8|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128017": {
+ "content": "<|reserved_special_token_9|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128018": {
+ "content": "<|reserved_special_token_10|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128019": {
+ "content": "<|reserved_special_token_11|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128020": {
+ "content": "<|reserved_special_token_12|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128021": {
+ "content": "<|reserved_special_token_13|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128022": {
+ "content": "<|reserved_special_token_14|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128023": {
+ "content": "<|reserved_special_token_15|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128024": {
+ "content": "<|reserved_special_token_16|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128025": {
+ "content": "<|reserved_special_token_17|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128026": {
+ "content": "<|reserved_special_token_18|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128027": {
+ "content": "<|reserved_special_token_19|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128028": {
+ "content": "<|reserved_special_token_20|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128029": {
+ "content": "<|reserved_special_token_21|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128030": {
+ "content": "<|reserved_special_token_22|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128031": {
+ "content": "<|reserved_special_token_23|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128032": {
+ "content": "<|reserved_special_token_24|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128033": {
+ "content": "<|reserved_special_token_25|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128034": {
+ "content": "<|reserved_special_token_26|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128035": {
+ "content": "<|reserved_special_token_27|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128036": {
+ "content": "<|reserved_special_token_28|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128037": {
+ "content": "<|reserved_special_token_29|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128038": {
+ "content": "<|reserved_special_token_30|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128039": {
+ "content": "<|reserved_special_token_31|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128040": {
+ "content": "<|reserved_special_token_32|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128041": {
+ "content": "<|reserved_special_token_33|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128042": {
+ "content": "<|reserved_special_token_34|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128043": {
+ "content": "<|reserved_special_token_35|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128044": {
+ "content": "<|reserved_special_token_36|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128045": {
+ "content": "<|reserved_special_token_37|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128046": {
+ "content": "<|reserved_special_token_38|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128047": {
+ "content": "<|reserved_special_token_39|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128048": {
+ "content": "<|reserved_special_token_40|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128049": {
+ "content": "<|reserved_special_token_41|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128050": {
+ "content": "<|reserved_special_token_42|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128051": {
+ "content": "<|reserved_special_token_43|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128052": {
+ "content": "<|reserved_special_token_44|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128053": {
+ "content": "<|reserved_special_token_45|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128054": {
+ "content": "<|reserved_special_token_46|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128055": {
+ "content": "<|reserved_special_token_47|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128056": {
+ "content": "<|reserved_special_token_48|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128057": {
+ "content": "<|reserved_special_token_49|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128058": {
+ "content": "<|reserved_special_token_50|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128059": {
+ "content": "<|reserved_special_token_51|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128060": {
+ "content": "<|reserved_special_token_52|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128061": {
+ "content": "<|reserved_special_token_53|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128062": {
+ "content": "<|reserved_special_token_54|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128063": {
+ "content": "<|reserved_special_token_55|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128064": {
+ "content": "<|reserved_special_token_56|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128065": {
+ "content": "<|reserved_special_token_57|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128066": {
+ "content": "<|reserved_special_token_58|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128067": {
+ "content": "<|reserved_special_token_59|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128068": {
+ "content": "<|reserved_special_token_60|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128069": {
+ "content": "<|reserved_special_token_61|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128070": {
+ "content": "<|reserved_special_token_62|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128071": {
+ "content": "<|reserved_special_token_63|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128072": {
+ "content": "<|reserved_special_token_64|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128073": {
+ "content": "<|reserved_special_token_65|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128074": {
+ "content": "<|reserved_special_token_66|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128075": {
+ "content": "<|reserved_special_token_67|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128076": {
+ "content": "<|reserved_special_token_68|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128077": {
+ "content": "<|reserved_special_token_69|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128078": {
+ "content": "<|reserved_special_token_70|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128079": {
+ "content": "<|reserved_special_token_71|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128080": {
+ "content": "<|reserved_special_token_72|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128081": {
+ "content": "<|reserved_special_token_73|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128082": {
+ "content": "<|reserved_special_token_74|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128083": {
+ "content": "<|reserved_special_token_75|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128084": {
+ "content": "<|reserved_special_token_76|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128085": {
+ "content": "<|reserved_special_token_77|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128086": {
+ "content": "<|reserved_special_token_78|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128087": {
+ "content": "<|reserved_special_token_79|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128088": {
+ "content": "<|reserved_special_token_80|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128089": {
+ "content": "<|reserved_special_token_81|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128090": {
+ "content": "<|reserved_special_token_82|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128091": {
+ "content": "<|reserved_special_token_83|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128092": {
+ "content": "<|reserved_special_token_84|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128093": {
+ "content": "<|reserved_special_token_85|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128094": {
+ "content": "<|reserved_special_token_86|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128095": {
+ "content": "<|reserved_special_token_87|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128096": {
+ "content": "<|reserved_special_token_88|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128097": {
+ "content": "<|reserved_special_token_89|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128098": {
+ "content": "<|reserved_special_token_90|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128099": {
+ "content": "<|reserved_special_token_91|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128100": {
+ "content": "<|reserved_special_token_92|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128101": {
+ "content": "<|reserved_special_token_93|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128102": {
+ "content": "<|reserved_special_token_94|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128103": {
+ "content": "<|reserved_special_token_95|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128104": {
+ "content": "<|reserved_special_token_96|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128105": {
+ "content": "<|reserved_special_token_97|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128106": {
+ "content": "<|reserved_special_token_98|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128107": {
+ "content": "<|reserved_special_token_99|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128108": {
+ "content": "<|reserved_special_token_100|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128109": {
+ "content": "<|reserved_special_token_101|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128110": {
+ "content": "<|reserved_special_token_102|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128111": {
+ "content": "<|reserved_special_token_103|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128112": {
+ "content": "<|reserved_special_token_104|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128113": {
+ "content": "<|reserved_special_token_105|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128114": {
+ "content": "<|reserved_special_token_106|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128115": {
+ "content": "<|reserved_special_token_107|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128116": {
+ "content": "<|reserved_special_token_108|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128117": {
+ "content": "<|reserved_special_token_109|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128118": {
+ "content": "<|reserved_special_token_110|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128119": {
+ "content": "<|reserved_special_token_111|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128120": {
+ "content": "<|reserved_special_token_112|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128121": {
+ "content": "<|reserved_special_token_113|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128122": {
+ "content": "<|reserved_special_token_114|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128123": {
+ "content": "<|reserved_special_token_115|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128124": {
+ "content": "<|reserved_special_token_116|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128125": {
+ "content": "<|reserved_special_token_117|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128126": {
+ "content": "<|reserved_special_token_118|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128127": {
+ "content": "<|reserved_special_token_119|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128128": {
+ "content": "<|reserved_special_token_120|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128129": {
+ "content": "<|reserved_special_token_121|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128130": {
+ "content": "<|reserved_special_token_122|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128131": {
+ "content": "<|reserved_special_token_123|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128132": {
+ "content": "<|reserved_special_token_124|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128133": {
+ "content": "<|reserved_special_token_125|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128134": {
+ "content": "<|reserved_special_token_126|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128135": {
+ "content": "<|reserved_special_token_127|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128136": {
+ "content": "<|reserved_special_token_128|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128137": {
+ "content": "<|reserved_special_token_129|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128138": {
+ "content": "<|reserved_special_token_130|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128139": {
+ "content": "<|reserved_special_token_131|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128140": {
+ "content": "<|reserved_special_token_132|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128141": {
+ "content": "<|reserved_special_token_133|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128142": {
+ "content": "<|reserved_special_token_134|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128143": {
+ "content": "<|reserved_special_token_135|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128144": {
+ "content": "<|reserved_special_token_136|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128145": {
+ "content": "<|reserved_special_token_137|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128146": {
+ "content": "<|reserved_special_token_138|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128147": {
+ "content": "<|reserved_special_token_139|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128148": {
+ "content": "<|reserved_special_token_140|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128149": {
+ "content": "<|reserved_special_token_141|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128150": {
+ "content": "<|reserved_special_token_142|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128151": {
+ "content": "<|reserved_special_token_143|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128152": {
+ "content": "<|reserved_special_token_144|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128153": {
+ "content": "<|reserved_special_token_145|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128154": {
+ "content": "<|reserved_special_token_146|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128155": {
+ "content": "<|reserved_special_token_147|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128156": {
+ "content": "<|reserved_special_token_148|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128157": {
+ "content": "<|reserved_special_token_149|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128158": {
+ "content": "<|reserved_special_token_150|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128159": {
+ "content": "<|reserved_special_token_151|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128160": {
+ "content": "<|reserved_special_token_152|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128161": {
+ "content": "<|reserved_special_token_153|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128162": {
+ "content": "<|reserved_special_token_154|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128163": {
+ "content": "<|reserved_special_token_155|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128164": {
+ "content": "<|reserved_special_token_156|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128165": {
+ "content": "<|reserved_special_token_157|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128166": {
+ "content": "<|reserved_special_token_158|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128167": {
+ "content": "<|reserved_special_token_159|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128168": {
+ "content": "<|reserved_special_token_160|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128169": {
+ "content": "<|reserved_special_token_161|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128170": {
+ "content": "<|reserved_special_token_162|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128171": {
+ "content": "<|reserved_special_token_163|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128172": {
+ "content": "<|reserved_special_token_164|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128173": {
+ "content": "<|reserved_special_token_165|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128174": {
+ "content": "<|reserved_special_token_166|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128175": {
+ "content": "<|reserved_special_token_167|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128176": {
+ "content": "<|reserved_special_token_168|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128177": {
+ "content": "<|reserved_special_token_169|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128178": {
+ "content": "<|reserved_special_token_170|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128179": {
+ "content": "<|reserved_special_token_171|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128180": {
+ "content": "<|reserved_special_token_172|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128181": {
+ "content": "<|reserved_special_token_173|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128182": {
+ "content": "<|reserved_special_token_174|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128183": {
+ "content": "<|reserved_special_token_175|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128184": {
+ "content": "<|reserved_special_token_176|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128185": {
+ "content": "<|reserved_special_token_177|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128186": {
+ "content": "<|reserved_special_token_178|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128187": {
+ "content": "<|reserved_special_token_179|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128188": {
+ "content": "<|reserved_special_token_180|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128189": {
+ "content": "<|reserved_special_token_181|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128190": {
+ "content": "<|reserved_special_token_182|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128191": {
+ "content": "<|reserved_special_token_183|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128192": {
+ "content": "<|reserved_special_token_184|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128193": {
+ "content": "<|reserved_special_token_185|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128194": {
+ "content": "<|reserved_special_token_186|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128195": {
+ "content": "<|reserved_special_token_187|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128196": {
+ "content": "<|reserved_special_token_188|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128197": {
+ "content": "<|reserved_special_token_189|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128198": {
+ "content": "<|reserved_special_token_190|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128199": {
+ "content": "<|reserved_special_token_191|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128200": {
+ "content": "<|reserved_special_token_192|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128201": {
+ "content": "<|reserved_special_token_193|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128202": {
+ "content": "<|reserved_special_token_194|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128203": {
+ "content": "<|reserved_special_token_195|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128204": {
+ "content": "<|reserved_special_token_196|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128205": {
+ "content": "<|reserved_special_token_197|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128206": {
+ "content": "<|reserved_special_token_198|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128207": {
+ "content": "<|reserved_special_token_199|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128208": {
+ "content": "<|reserved_special_token_200|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128209": {
+ "content": "<|reserved_special_token_201|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128210": {
+ "content": "<|reserved_special_token_202|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128211": {
+ "content": "<|reserved_special_token_203|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128212": {
+ "content": "<|reserved_special_token_204|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128213": {
+ "content": "<|reserved_special_token_205|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128214": {
+ "content": "<|reserved_special_token_206|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128215": {
+ "content": "<|reserved_special_token_207|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128216": {
+ "content": "<|reserved_special_token_208|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128217": {
+ "content": "<|reserved_special_token_209|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128218": {
+ "content": "<|reserved_special_token_210|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128219": {
+ "content": "<|reserved_special_token_211|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128220": {
+ "content": "<|reserved_special_token_212|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128221": {
+ "content": "<|reserved_special_token_213|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128222": {
+ "content": "<|reserved_special_token_214|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128223": {
+ "content": "<|reserved_special_token_215|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128224": {
+ "content": "<|reserved_special_token_216|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128225": {
+ "content": "<|reserved_special_token_217|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128226": {
+ "content": "<|reserved_special_token_218|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128227": {
+ "content": "<|reserved_special_token_219|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128228": {
+ "content": "<|reserved_special_token_220|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128229": {
+ "content": "<|reserved_special_token_221|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128230": {
+ "content": "<|reserved_special_token_222|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128231": {
+ "content": "<|reserved_special_token_223|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128232": {
+ "content": "<|reserved_special_token_224|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128233": {
+ "content": "<|reserved_special_token_225|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128234": {
+ "content": "<|reserved_special_token_226|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128235": {
+ "content": "<|reserved_special_token_227|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128236": {
+ "content": "<|reserved_special_token_228|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128237": {
+ "content": "<|reserved_special_token_229|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128238": {
+ "content": "<|reserved_special_token_230|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128239": {
+ "content": "<|reserved_special_token_231|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128240": {
+ "content": "<|reserved_special_token_232|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128241": {
+ "content": "<|reserved_special_token_233|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128242": {
+ "content": "<|reserved_special_token_234|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128243": {
+ "content": "<|reserved_special_token_235|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128244": {
+ "content": "<|reserved_special_token_236|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128245": {
+ "content": "<|reserved_special_token_237|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128246": {
+ "content": "<|reserved_special_token_238|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128247": {
+ "content": "<|reserved_special_token_239|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128248": {
+ "content": "<|reserved_special_token_240|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128249": {
+ "content": "<|reserved_special_token_241|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128250": {
+ "content": "<|reserved_special_token_242|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128251": {
+ "content": "<|reserved_special_token_243|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128252": {
+ "content": "<|reserved_special_token_244|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128253": {
+ "content": "<|reserved_special_token_245|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128254": {
+ "content": "<|reserved_special_token_246|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128255": {
+ "content": "<|reserved_special_token_247|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<|begin_of_text|>",
+ "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n",
+ "clean_up_tokenization_spaces": true,
+ "eos_token": "<|eot_id|>",
+ "extra_special_tokens": {},
+ "model_input_names": [
+ "input_ids",
+ "attention_mask"
+ ],
+ "model_max_length": 131072,
+ "pad_token": "<|end_of_text|>",
+ "tokenizer_class": "PreTrainedTokenizer"
+}
diff --git a/checkpoint-848/trainer_state.json b/checkpoint-848/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..5ed7109637d6c2b0703001ab1ca92ae934b71939
--- /dev/null
+++ b/checkpoint-848/trainer_state.json
@@ -0,0 +1,5969 @@
+{
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 3.9968652037617556,
+ "eval_steps": 500,
+ "global_step": 848,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.004702194357366771,
+ "grad_norm": 3.1606569290161133,
+ "learning_rate": 5.0000000000000004e-08,
+ "loss": 1.0072,
+ "step": 1
+ },
+ {
+ "epoch": 0.009404388714733543,
+ "grad_norm": 3.2058725357055664,
+ "learning_rate": 1.0000000000000001e-07,
+ "loss": 1.0134,
+ "step": 2
+ },
+ {
+ "epoch": 0.014106583072100314,
+ "grad_norm": 2.636291265487671,
+ "learning_rate": 1.5000000000000002e-07,
+ "loss": 0.9635,
+ "step": 3
+ },
+ {
+ "epoch": 0.018808777429467086,
+ "grad_norm": 2.708746910095215,
+ "learning_rate": 2.0000000000000002e-07,
+ "loss": 1.0068,
+ "step": 4
+ },
+ {
+ "epoch": 0.023510971786833857,
+ "grad_norm": 2.8948426246643066,
+ "learning_rate": 2.5000000000000004e-07,
+ "loss": 0.9608,
+ "step": 5
+ },
+ {
+ "epoch": 0.02821316614420063,
+ "grad_norm": 2.8740086555480957,
+ "learning_rate": 3.0000000000000004e-07,
+ "loss": 0.9896,
+ "step": 6
+ },
+ {
+ "epoch": 0.032915360501567396,
+ "grad_norm": 2.8338170051574707,
+ "learning_rate": 3.5000000000000004e-07,
+ "loss": 0.9098,
+ "step": 7
+ },
+ {
+ "epoch": 0.03761755485893417,
+ "grad_norm": 2.7783002853393555,
+ "learning_rate": 4.0000000000000003e-07,
+ "loss": 0.9733,
+ "step": 8
+ },
+ {
+ "epoch": 0.04231974921630094,
+ "grad_norm": 3.043574333190918,
+ "learning_rate": 4.5000000000000003e-07,
+ "loss": 0.9943,
+ "step": 9
+ },
+ {
+ "epoch": 0.047021943573667714,
+ "grad_norm": 3.142383337020874,
+ "learning_rate": 5.000000000000001e-07,
+ "loss": 0.9475,
+ "step": 10
+ },
+ {
+ "epoch": 0.05172413793103448,
+ "grad_norm": 2.9817280769348145,
+ "learning_rate": 5.5e-07,
+ "loss": 0.9701,
+ "step": 11
+ },
+ {
+ "epoch": 0.05642633228840126,
+ "grad_norm": 2.95699405670166,
+ "learning_rate": 6.000000000000001e-07,
+ "loss": 0.9983,
+ "step": 12
+ },
+ {
+ "epoch": 0.061128526645768025,
+ "grad_norm": 2.8782453536987305,
+ "learning_rate": 6.5e-07,
+ "loss": 0.9502,
+ "step": 13
+ },
+ {
+ "epoch": 0.06583072100313479,
+ "grad_norm": 2.6715071201324463,
+ "learning_rate": 7.000000000000001e-07,
+ "loss": 0.9436,
+ "step": 14
+ },
+ {
+ "epoch": 0.07053291536050156,
+ "grad_norm": 3.869649648666382,
+ "learning_rate": 7.5e-07,
+ "loss": 0.9692,
+ "step": 15
+ },
+ {
+ "epoch": 0.07523510971786834,
+ "grad_norm": 3.060220956802368,
+ "learning_rate": 8.000000000000001e-07,
+ "loss": 0.9258,
+ "step": 16
+ },
+ {
+ "epoch": 0.07993730407523511,
+ "grad_norm": 2.8922741413116455,
+ "learning_rate": 8.500000000000001e-07,
+ "loss": 0.9719,
+ "step": 17
+ },
+ {
+ "epoch": 0.08463949843260188,
+ "grad_norm": 2.7857820987701416,
+ "learning_rate": 9.000000000000001e-07,
+ "loss": 0.9072,
+ "step": 18
+ },
+ {
+ "epoch": 0.08934169278996865,
+ "grad_norm": 2.9753293991088867,
+ "learning_rate": 9.500000000000001e-07,
+ "loss": 0.9032,
+ "step": 19
+ },
+ {
+ "epoch": 0.09404388714733543,
+ "grad_norm": 2.7989683151245117,
+ "learning_rate": 1.0000000000000002e-06,
+ "loss": 0.8887,
+ "step": 20
+ },
+ {
+ "epoch": 0.0987460815047022,
+ "grad_norm": 2.3953049182891846,
+ "learning_rate": 1.0500000000000001e-06,
+ "loss": 0.8968,
+ "step": 21
+ },
+ {
+ "epoch": 0.10344827586206896,
+ "grad_norm": 2.643731117248535,
+ "learning_rate": 1.1e-06,
+ "loss": 0.8501,
+ "step": 22
+ },
+ {
+ "epoch": 0.10815047021943573,
+ "grad_norm": 2.3679006099700928,
+ "learning_rate": 1.1500000000000002e-06,
+ "loss": 0.8476,
+ "step": 23
+ },
+ {
+ "epoch": 0.11285266457680251,
+ "grad_norm": 2.5935540199279785,
+ "learning_rate": 1.2000000000000002e-06,
+ "loss": 0.8095,
+ "step": 24
+ },
+ {
+ "epoch": 0.11755485893416928,
+ "grad_norm": 2.510300636291504,
+ "learning_rate": 1.25e-06,
+ "loss": 0.8099,
+ "step": 25
+ },
+ {
+ "epoch": 0.12225705329153605,
+ "grad_norm": 2.372344970703125,
+ "learning_rate": 1.3e-06,
+ "loss": 0.7869,
+ "step": 26
+ },
+ {
+ "epoch": 0.12695924764890282,
+ "grad_norm": 2.303426504135132,
+ "learning_rate": 1.3500000000000002e-06,
+ "loss": 0.7758,
+ "step": 27
+ },
+ {
+ "epoch": 0.13166144200626959,
+ "grad_norm": 1.9017939567565918,
+ "learning_rate": 1.4000000000000001e-06,
+ "loss": 0.7498,
+ "step": 28
+ },
+ {
+ "epoch": 0.13636363636363635,
+ "grad_norm": 1.8810580968856812,
+ "learning_rate": 1.45e-06,
+ "loss": 0.7878,
+ "step": 29
+ },
+ {
+ "epoch": 0.14106583072100312,
+ "grad_norm": 1.7797424793243408,
+ "learning_rate": 1.5e-06,
+ "loss": 0.7747,
+ "step": 30
+ },
+ {
+ "epoch": 0.14576802507836992,
+ "grad_norm": 1.5053879022598267,
+ "learning_rate": 1.5500000000000002e-06,
+ "loss": 0.7735,
+ "step": 31
+ },
+ {
+ "epoch": 0.15047021943573669,
+ "grad_norm": 1.4909234046936035,
+ "learning_rate": 1.6000000000000001e-06,
+ "loss": 0.7654,
+ "step": 32
+ },
+ {
+ "epoch": 0.15517241379310345,
+ "grad_norm": 1.36083984375,
+ "learning_rate": 1.6500000000000003e-06,
+ "loss": 0.6895,
+ "step": 33
+ },
+ {
+ "epoch": 0.15987460815047022,
+ "grad_norm": 1.536014199256897,
+ "learning_rate": 1.7000000000000002e-06,
+ "loss": 0.675,
+ "step": 34
+ },
+ {
+ "epoch": 0.164576802507837,
+ "grad_norm": 1.3426779508590698,
+ "learning_rate": 1.75e-06,
+ "loss": 0.7652,
+ "step": 35
+ },
+ {
+ "epoch": 0.16927899686520376,
+ "grad_norm": 1.4900612831115723,
+ "learning_rate": 1.8000000000000001e-06,
+ "loss": 0.6863,
+ "step": 36
+ },
+ {
+ "epoch": 0.17398119122257052,
+ "grad_norm": 1.181241750717163,
+ "learning_rate": 1.85e-06,
+ "loss": 0.7136,
+ "step": 37
+ },
+ {
+ "epoch": 0.1786833855799373,
+ "grad_norm": 1.461419701576233,
+ "learning_rate": 1.9000000000000002e-06,
+ "loss": 0.7606,
+ "step": 38
+ },
+ {
+ "epoch": 0.1833855799373041,
+ "grad_norm": 1.04817795753479,
+ "learning_rate": 1.9500000000000004e-06,
+ "loss": 0.6829,
+ "step": 39
+ },
+ {
+ "epoch": 0.18808777429467086,
+ "grad_norm": 1.0499993562698364,
+ "learning_rate": 2.0000000000000003e-06,
+ "loss": 0.7144,
+ "step": 40
+ },
+ {
+ "epoch": 0.19278996865203762,
+ "grad_norm": 0.9935064315795898,
+ "learning_rate": 2.05e-06,
+ "loss": 0.6736,
+ "step": 41
+ },
+ {
+ "epoch": 0.1974921630094044,
+ "grad_norm": 0.9919099807739258,
+ "learning_rate": 2.1000000000000002e-06,
+ "loss": 0.7151,
+ "step": 42
+ },
+ {
+ "epoch": 0.20219435736677116,
+ "grad_norm": 0.919556200504303,
+ "learning_rate": 2.15e-06,
+ "loss": 0.6847,
+ "step": 43
+ },
+ {
+ "epoch": 0.20689655172413793,
+ "grad_norm": 1.4762015342712402,
+ "learning_rate": 2.2e-06,
+ "loss": 0.6694,
+ "step": 44
+ },
+ {
+ "epoch": 0.2115987460815047,
+ "grad_norm": 0.9243163466453552,
+ "learning_rate": 2.25e-06,
+ "loss": 0.6489,
+ "step": 45
+ },
+ {
+ "epoch": 0.21630094043887146,
+ "grad_norm": 0.7614469528198242,
+ "learning_rate": 2.3000000000000004e-06,
+ "loss": 0.6568,
+ "step": 46
+ },
+ {
+ "epoch": 0.22100313479623823,
+ "grad_norm": 0.7543922662734985,
+ "learning_rate": 2.35e-06,
+ "loss": 0.6359,
+ "step": 47
+ },
+ {
+ "epoch": 0.22570532915360503,
+ "grad_norm": 0.7558912038803101,
+ "learning_rate": 2.4000000000000003e-06,
+ "loss": 0.6231,
+ "step": 48
+ },
+ {
+ "epoch": 0.2304075235109718,
+ "grad_norm": 0.7822129130363464,
+ "learning_rate": 2.4500000000000003e-06,
+ "loss": 0.6691,
+ "step": 49
+ },
+ {
+ "epoch": 0.23510971786833856,
+ "grad_norm": 0.8646999597549438,
+ "learning_rate": 2.5e-06,
+ "loss": 0.682,
+ "step": 50
+ },
+ {
+ "epoch": 0.23981191222570533,
+ "grad_norm": 0.8824774622917175,
+ "learning_rate": 2.55e-06,
+ "loss": 0.6805,
+ "step": 51
+ },
+ {
+ "epoch": 0.2445141065830721,
+ "grad_norm": 0.7697399258613586,
+ "learning_rate": 2.6e-06,
+ "loss": 0.6368,
+ "step": 52
+ },
+ {
+ "epoch": 0.24921630094043887,
+ "grad_norm": 0.6522512435913086,
+ "learning_rate": 2.6500000000000005e-06,
+ "loss": 0.6367,
+ "step": 53
+ },
+ {
+ "epoch": 0.25391849529780564,
+ "grad_norm": 0.6172305941581726,
+ "learning_rate": 2.7000000000000004e-06,
+ "loss": 0.6291,
+ "step": 54
+ },
+ {
+ "epoch": 0.25862068965517243,
+ "grad_norm": 0.7860460877418518,
+ "learning_rate": 2.7500000000000004e-06,
+ "loss": 0.6736,
+ "step": 55
+ },
+ {
+ "epoch": 0.26332288401253917,
+ "grad_norm": 0.6474862694740295,
+ "learning_rate": 2.8000000000000003e-06,
+ "loss": 0.6365,
+ "step": 56
+ },
+ {
+ "epoch": 0.26802507836990597,
+ "grad_norm": 0.6867114901542664,
+ "learning_rate": 2.85e-06,
+ "loss": 0.6397,
+ "step": 57
+ },
+ {
+ "epoch": 0.2727272727272727,
+ "grad_norm": 0.7056852579116821,
+ "learning_rate": 2.9e-06,
+ "loss": 0.6138,
+ "step": 58
+ },
+ {
+ "epoch": 0.2774294670846395,
+ "grad_norm": 0.6615664958953857,
+ "learning_rate": 2.95e-06,
+ "loss": 0.6482,
+ "step": 59
+ },
+ {
+ "epoch": 0.28213166144200624,
+ "grad_norm": 0.6649022102355957,
+ "learning_rate": 3e-06,
+ "loss": 0.6745,
+ "step": 60
+ },
+ {
+ "epoch": 0.28683385579937304,
+ "grad_norm": 0.850848913192749,
+ "learning_rate": 3.05e-06,
+ "loss": 0.5956,
+ "step": 61
+ },
+ {
+ "epoch": 0.29153605015673983,
+ "grad_norm": 0.5983562469482422,
+ "learning_rate": 3.1000000000000004e-06,
+ "loss": 0.5894,
+ "step": 62
+ },
+ {
+ "epoch": 0.2962382445141066,
+ "grad_norm": 0.6286782622337341,
+ "learning_rate": 3.1500000000000003e-06,
+ "loss": 0.6329,
+ "step": 63
+ },
+ {
+ "epoch": 0.30094043887147337,
+ "grad_norm": 0.5919945240020752,
+ "learning_rate": 3.2000000000000003e-06,
+ "loss": 0.6402,
+ "step": 64
+ },
+ {
+ "epoch": 0.3056426332288401,
+ "grad_norm": 0.5632765889167786,
+ "learning_rate": 3.2500000000000002e-06,
+ "loss": 0.5862,
+ "step": 65
+ },
+ {
+ "epoch": 0.3103448275862069,
+ "grad_norm": 0.7692590951919556,
+ "learning_rate": 3.3000000000000006e-06,
+ "loss": 0.6031,
+ "step": 66
+ },
+ {
+ "epoch": 0.31504702194357365,
+ "grad_norm": 0.7313893437385559,
+ "learning_rate": 3.3500000000000005e-06,
+ "loss": 0.6312,
+ "step": 67
+ },
+ {
+ "epoch": 0.31974921630094044,
+ "grad_norm": 0.6097120642662048,
+ "learning_rate": 3.4000000000000005e-06,
+ "loss": 0.5986,
+ "step": 68
+ },
+ {
+ "epoch": 0.32445141065830724,
+ "grad_norm": 0.5853808522224426,
+ "learning_rate": 3.45e-06,
+ "loss": 0.5847,
+ "step": 69
+ },
+ {
+ "epoch": 0.329153605015674,
+ "grad_norm": 0.6093555092811584,
+ "learning_rate": 3.5e-06,
+ "loss": 0.6552,
+ "step": 70
+ },
+ {
+ "epoch": 0.3338557993730408,
+ "grad_norm": 0.6106334328651428,
+ "learning_rate": 3.5500000000000003e-06,
+ "loss": 0.6196,
+ "step": 71
+ },
+ {
+ "epoch": 0.3385579937304075,
+ "grad_norm": 0.9254828691482544,
+ "learning_rate": 3.6000000000000003e-06,
+ "loss": 0.6005,
+ "step": 72
+ },
+ {
+ "epoch": 0.3432601880877743,
+ "grad_norm": 0.5471694469451904,
+ "learning_rate": 3.65e-06,
+ "loss": 0.5907,
+ "step": 73
+ },
+ {
+ "epoch": 0.34796238244514105,
+ "grad_norm": 0.6204228401184082,
+ "learning_rate": 3.7e-06,
+ "loss": 0.6079,
+ "step": 74
+ },
+ {
+ "epoch": 0.35266457680250785,
+ "grad_norm": 0.52458256483078,
+ "learning_rate": 3.7500000000000005e-06,
+ "loss": 0.6001,
+ "step": 75
+ },
+ {
+ "epoch": 0.3573667711598746,
+ "grad_norm": 0.5356763601303101,
+ "learning_rate": 3.8000000000000005e-06,
+ "loss": 0.5987,
+ "step": 76
+ },
+ {
+ "epoch": 0.3620689655172414,
+ "grad_norm": 0.5408467054367065,
+ "learning_rate": 3.85e-06,
+ "loss": 0.6104,
+ "step": 77
+ },
+ {
+ "epoch": 0.3667711598746082,
+ "grad_norm": 0.5075871348381042,
+ "learning_rate": 3.900000000000001e-06,
+ "loss": 0.5569,
+ "step": 78
+ },
+ {
+ "epoch": 0.3714733542319749,
+ "grad_norm": 0.8474109768867493,
+ "learning_rate": 3.95e-06,
+ "loss": 0.6195,
+ "step": 79
+ },
+ {
+ "epoch": 0.3761755485893417,
+ "grad_norm": 0.4750897288322449,
+ "learning_rate": 4.000000000000001e-06,
+ "loss": 0.5399,
+ "step": 80
+ },
+ {
+ "epoch": 0.38087774294670845,
+ "grad_norm": 0.5082002878189087,
+ "learning_rate": 4.05e-06,
+ "loss": 0.5997,
+ "step": 81
+ },
+ {
+ "epoch": 0.38557993730407525,
+ "grad_norm": 0.5343796014785767,
+ "learning_rate": 4.1e-06,
+ "loss": 0.5704,
+ "step": 82
+ },
+ {
+ "epoch": 0.390282131661442,
+ "grad_norm": 0.520311713218689,
+ "learning_rate": 4.15e-06,
+ "loss": 0.5818,
+ "step": 83
+ },
+ {
+ "epoch": 0.3949843260188088,
+ "grad_norm": 0.5292978286743164,
+ "learning_rate": 4.2000000000000004e-06,
+ "loss": 0.5852,
+ "step": 84
+ },
+ {
+ "epoch": 0.3996865203761755,
+ "grad_norm": 0.539886474609375,
+ "learning_rate": 4.25e-06,
+ "loss": 0.6057,
+ "step": 85
+ },
+ {
+ "epoch": 0.4043887147335423,
+ "grad_norm": 0.6468827128410339,
+ "learning_rate": 4.3e-06,
+ "loss": 0.6122,
+ "step": 86
+ },
+ {
+ "epoch": 0.4090909090909091,
+ "grad_norm": 0.5537365078926086,
+ "learning_rate": 4.350000000000001e-06,
+ "loss": 0.5652,
+ "step": 87
+ },
+ {
+ "epoch": 0.41379310344827586,
+ "grad_norm": 0.6226018667221069,
+ "learning_rate": 4.4e-06,
+ "loss": 0.5884,
+ "step": 88
+ },
+ {
+ "epoch": 0.41849529780564265,
+ "grad_norm": 0.5016945004463196,
+ "learning_rate": 4.450000000000001e-06,
+ "loss": 0.5877,
+ "step": 89
+ },
+ {
+ "epoch": 0.4231974921630094,
+ "grad_norm": 0.5059167146682739,
+ "learning_rate": 4.5e-06,
+ "loss": 0.5676,
+ "step": 90
+ },
+ {
+ "epoch": 0.4278996865203762,
+ "grad_norm": 0.47521743178367615,
+ "learning_rate": 4.5500000000000005e-06,
+ "loss": 0.5929,
+ "step": 91
+ },
+ {
+ "epoch": 0.43260188087774293,
+ "grad_norm": 0.531306266784668,
+ "learning_rate": 4.600000000000001e-06,
+ "loss": 0.5983,
+ "step": 92
+ },
+ {
+ "epoch": 0.4373040752351097,
+ "grad_norm": 0.4965567886829376,
+ "learning_rate": 4.65e-06,
+ "loss": 0.5279,
+ "step": 93
+ },
+ {
+ "epoch": 0.44200626959247646,
+ "grad_norm": 0.5125988125801086,
+ "learning_rate": 4.7e-06,
+ "loss": 0.5436,
+ "step": 94
+ },
+ {
+ "epoch": 0.44670846394984326,
+ "grad_norm": 0.557763934135437,
+ "learning_rate": 4.75e-06,
+ "loss": 0.5496,
+ "step": 95
+ },
+ {
+ "epoch": 0.45141065830721006,
+ "grad_norm": 0.6993274092674255,
+ "learning_rate": 4.800000000000001e-06,
+ "loss": 0.5498,
+ "step": 96
+ },
+ {
+ "epoch": 0.4561128526645768,
+ "grad_norm": 0.5485453009605408,
+ "learning_rate": 4.85e-06,
+ "loss": 0.5552,
+ "step": 97
+ },
+ {
+ "epoch": 0.4608150470219436,
+ "grad_norm": 1.9821522235870361,
+ "learning_rate": 4.9000000000000005e-06,
+ "loss": 0.569,
+ "step": 98
+ },
+ {
+ "epoch": 0.46551724137931033,
+ "grad_norm": 0.6074144840240479,
+ "learning_rate": 4.95e-06,
+ "loss": 0.5546,
+ "step": 99
+ },
+ {
+ "epoch": 0.4702194357366771,
+ "grad_norm": 0.5404040813446045,
+ "learning_rate": 5e-06,
+ "loss": 0.5775,
+ "step": 100
+ },
+ {
+ "epoch": 0.47492163009404387,
+ "grad_norm": 0.500438928604126,
+ "learning_rate": 4.9999910183883085e-06,
+ "loss": 0.5569,
+ "step": 101
+ },
+ {
+ "epoch": 0.47962382445141066,
+ "grad_norm": 0.5036981701850891,
+ "learning_rate": 4.999964073617768e-06,
+ "loss": 0.5663,
+ "step": 102
+ },
+ {
+ "epoch": 0.4843260188087774,
+ "grad_norm": 0.4537642300128937,
+ "learning_rate": 4.999919165881985e-06,
+ "loss": 0.5527,
+ "step": 103
+ },
+ {
+ "epoch": 0.4890282131661442,
+ "grad_norm": 0.49653521180152893,
+ "learning_rate": 4.999856295503635e-06,
+ "loss": 0.563,
+ "step": 104
+ },
+ {
+ "epoch": 0.493730407523511,
+ "grad_norm": 0.46847566962242126,
+ "learning_rate": 4.9997754629344596e-06,
+ "loss": 0.5425,
+ "step": 105
+ },
+ {
+ "epoch": 0.49843260188087773,
+ "grad_norm": 0.5192411541938782,
+ "learning_rate": 4.999676668755263e-06,
+ "loss": 0.5315,
+ "step": 106
+ },
+ {
+ "epoch": 0.5031347962382445,
+ "grad_norm": 0.5170287489891052,
+ "learning_rate": 4.999559913675912e-06,
+ "loss": 0.5627,
+ "step": 107
+ },
+ {
+ "epoch": 0.5078369905956113,
+ "grad_norm": 0.47297438979148865,
+ "learning_rate": 4.999425198535325e-06,
+ "loss": 0.5432,
+ "step": 108
+ },
+ {
+ "epoch": 0.512539184952978,
+ "grad_norm": 0.4873776137828827,
+ "learning_rate": 4.999272524301469e-06,
+ "loss": 0.5473,
+ "step": 109
+ },
+ {
+ "epoch": 0.5172413793103449,
+ "grad_norm": 0.5432935357093811,
+ "learning_rate": 4.9991018920713505e-06,
+ "loss": 0.5642,
+ "step": 110
+ },
+ {
+ "epoch": 0.5219435736677116,
+ "grad_norm": 0.4850105345249176,
+ "learning_rate": 4.9989133030710154e-06,
+ "loss": 0.548,
+ "step": 111
+ },
+ {
+ "epoch": 0.5266457680250783,
+ "grad_norm": 0.9399585723876953,
+ "learning_rate": 4.9987067586555275e-06,
+ "loss": 0.5471,
+ "step": 112
+ },
+ {
+ "epoch": 0.5313479623824452,
+ "grad_norm": 0.5167811512947083,
+ "learning_rate": 4.998482260308969e-06,
+ "loss": 0.5648,
+ "step": 113
+ },
+ {
+ "epoch": 0.5360501567398119,
+ "grad_norm": 0.5069029927253723,
+ "learning_rate": 4.998239809644427e-06,
+ "loss": 0.5568,
+ "step": 114
+ },
+ {
+ "epoch": 0.5407523510971787,
+ "grad_norm": 0.8738563656806946,
+ "learning_rate": 4.9979794084039755e-06,
+ "loss": 0.5719,
+ "step": 115
+ },
+ {
+ "epoch": 0.5454545454545454,
+ "grad_norm": 0.5216553807258606,
+ "learning_rate": 4.997701058458677e-06,
+ "loss": 0.5309,
+ "step": 116
+ },
+ {
+ "epoch": 0.5501567398119123,
+ "grad_norm": 0.9678344130516052,
+ "learning_rate": 4.997404761808554e-06,
+ "loss": 0.5645,
+ "step": 117
+ },
+ {
+ "epoch": 0.554858934169279,
+ "grad_norm": 0.496598482131958,
+ "learning_rate": 4.9970905205825845e-06,
+ "loss": 0.5711,
+ "step": 118
+ },
+ {
+ "epoch": 0.5595611285266457,
+ "grad_norm": 0.4745199680328369,
+ "learning_rate": 4.996758337038683e-06,
+ "loss": 0.5613,
+ "step": 119
+ },
+ {
+ "epoch": 0.5642633228840125,
+ "grad_norm": 0.5595977902412415,
+ "learning_rate": 4.996408213563684e-06,
+ "loss": 0.5559,
+ "step": 120
+ },
+ {
+ "epoch": 0.5689655172413793,
+ "grad_norm": 0.4743712544441223,
+ "learning_rate": 4.996040152673326e-06,
+ "loss": 0.5228,
+ "step": 121
+ },
+ {
+ "epoch": 0.5736677115987461,
+ "grad_norm": 0.5418100953102112,
+ "learning_rate": 4.995654157012233e-06,
+ "loss": 0.536,
+ "step": 122
+ },
+ {
+ "epoch": 0.5783699059561128,
+ "grad_norm": 0.521977424621582,
+ "learning_rate": 4.995250229353895e-06,
+ "loss": 0.5305,
+ "step": 123
+ },
+ {
+ "epoch": 0.5830721003134797,
+ "grad_norm": 0.5062761902809143,
+ "learning_rate": 4.99482837260065e-06,
+ "loss": 0.5417,
+ "step": 124
+ },
+ {
+ "epoch": 0.5877742946708464,
+ "grad_norm": 0.5895913243293762,
+ "learning_rate": 4.99438858978366e-06,
+ "loss": 0.573,
+ "step": 125
+ },
+ {
+ "epoch": 0.5924764890282131,
+ "grad_norm": 0.5442466139793396,
+ "learning_rate": 4.993930884062892e-06,
+ "loss": 0.5563,
+ "step": 126
+ },
+ {
+ "epoch": 0.5971786833855799,
+ "grad_norm": 0.5130571722984314,
+ "learning_rate": 4.993455258727094e-06,
+ "loss": 0.5549,
+ "step": 127
+ },
+ {
+ "epoch": 0.6018808777429467,
+ "grad_norm": 0.5579081773757935,
+ "learning_rate": 4.992961717193773e-06,
+ "loss": 0.5554,
+ "step": 128
+ },
+ {
+ "epoch": 0.6065830721003135,
+ "grad_norm": 0.6375890374183655,
+ "learning_rate": 4.9924502630091655e-06,
+ "loss": 0.5626,
+ "step": 129
+ },
+ {
+ "epoch": 0.6112852664576802,
+ "grad_norm": 0.5129190683364868,
+ "learning_rate": 4.99192089984822e-06,
+ "loss": 0.5493,
+ "step": 130
+ },
+ {
+ "epoch": 0.6159874608150471,
+ "grad_norm": 0.5293419361114502,
+ "learning_rate": 4.9913736315145614e-06,
+ "loss": 0.5565,
+ "step": 131
+ },
+ {
+ "epoch": 0.6206896551724138,
+ "grad_norm": 0.6502572298049927,
+ "learning_rate": 4.990808461940474e-06,
+ "loss": 0.5358,
+ "step": 132
+ },
+ {
+ "epoch": 0.6253918495297806,
+ "grad_norm": 0.5450296998023987,
+ "learning_rate": 4.990225395186862e-06,
+ "loss": 0.5421,
+ "step": 133
+ },
+ {
+ "epoch": 0.6300940438871473,
+ "grad_norm": 0.45506399869918823,
+ "learning_rate": 4.9896244354432314e-06,
+ "loss": 0.5396,
+ "step": 134
+ },
+ {
+ "epoch": 0.6347962382445141,
+ "grad_norm": 0.5095545649528503,
+ "learning_rate": 4.98900558702765e-06,
+ "loss": 0.5486,
+ "step": 135
+ },
+ {
+ "epoch": 0.6394984326018809,
+ "grad_norm": 0.4836446940898895,
+ "learning_rate": 4.9883688543867225e-06,
+ "loss": 0.5596,
+ "step": 136
+ },
+ {
+ "epoch": 0.6442006269592476,
+ "grad_norm": 0.5253512859344482,
+ "learning_rate": 4.987714242095558e-06,
+ "loss": 0.5308,
+ "step": 137
+ },
+ {
+ "epoch": 0.6489028213166145,
+ "grad_norm": 0.8280164003372192,
+ "learning_rate": 4.9870417548577355e-06,
+ "loss": 0.5349,
+ "step": 138
+ },
+ {
+ "epoch": 0.6536050156739812,
+ "grad_norm": 0.4729730188846588,
+ "learning_rate": 4.9863513975052696e-06,
+ "loss": 0.5416,
+ "step": 139
+ },
+ {
+ "epoch": 0.658307210031348,
+ "grad_norm": 0.5932718515396118,
+ "learning_rate": 4.985643174998578e-06,
+ "loss": 0.5638,
+ "step": 140
+ },
+ {
+ "epoch": 0.6630094043887147,
+ "grad_norm": 0.5187026262283325,
+ "learning_rate": 4.984917092426445e-06,
+ "loss": 0.5507,
+ "step": 141
+ },
+ {
+ "epoch": 0.6677115987460815,
+ "grad_norm": 0.5024245977401733,
+ "learning_rate": 4.984173155005982e-06,
+ "loss": 0.5406,
+ "step": 142
+ },
+ {
+ "epoch": 0.6724137931034483,
+ "grad_norm": 0.4735509157180786,
+ "learning_rate": 4.983411368082597e-06,
+ "loss": 0.5431,
+ "step": 143
+ },
+ {
+ "epoch": 0.677115987460815,
+ "grad_norm": 0.5040024518966675,
+ "learning_rate": 4.982631737129948e-06,
+ "loss": 0.5291,
+ "step": 144
+ },
+ {
+ "epoch": 0.6818181818181818,
+ "grad_norm": 0.47764894366264343,
+ "learning_rate": 4.98183426774991e-06,
+ "loss": 0.5677,
+ "step": 145
+ },
+ {
+ "epoch": 0.6865203761755486,
+ "grad_norm": 0.5211489796638489,
+ "learning_rate": 4.981018965672529e-06,
+ "loss": 0.566,
+ "step": 146
+ },
+ {
+ "epoch": 0.6912225705329154,
+ "grad_norm": 1.022007942199707,
+ "learning_rate": 4.98018583675599e-06,
+ "loss": 0.5476,
+ "step": 147
+ },
+ {
+ "epoch": 0.6959247648902821,
+ "grad_norm": 0.5263912677764893,
+ "learning_rate": 4.979334886986562e-06,
+ "loss": 0.5473,
+ "step": 148
+ },
+ {
+ "epoch": 0.700626959247649,
+ "grad_norm": 0.5014091730117798,
+ "learning_rate": 4.978466122478567e-06,
+ "loss": 0.5642,
+ "step": 149
+ },
+ {
+ "epoch": 0.7053291536050157,
+ "grad_norm": 0.5003350973129272,
+ "learning_rate": 4.97757954947433e-06,
+ "loss": 0.5311,
+ "step": 150
+ },
+ {
+ "epoch": 0.7100313479623824,
+ "grad_norm": 0.5010690093040466,
+ "learning_rate": 4.976675174344132e-06,
+ "loss": 0.5469,
+ "step": 151
+ },
+ {
+ "epoch": 0.7147335423197492,
+ "grad_norm": 0.45779237151145935,
+ "learning_rate": 4.975753003586172e-06,
+ "loss": 0.5273,
+ "step": 152
+ },
+ {
+ "epoch": 0.719435736677116,
+ "grad_norm": 0.6231539845466614,
+ "learning_rate": 4.974813043826513e-06,
+ "loss": 0.5182,
+ "step": 153
+ },
+ {
+ "epoch": 0.7241379310344828,
+ "grad_norm": 0.5361394286155701,
+ "learning_rate": 4.973855301819039e-06,
+ "loss": 0.5372,
+ "step": 154
+ },
+ {
+ "epoch": 0.7288401253918495,
+ "grad_norm": 0.5193538665771484,
+ "learning_rate": 4.972879784445402e-06,
+ "loss": 0.5201,
+ "step": 155
+ },
+ {
+ "epoch": 0.7335423197492164,
+ "grad_norm": 0.47956809401512146,
+ "learning_rate": 4.971886498714978e-06,
+ "loss": 0.5402,
+ "step": 156
+ },
+ {
+ "epoch": 0.7382445141065831,
+ "grad_norm": 0.5303016901016235,
+ "learning_rate": 4.97087545176481e-06,
+ "loss": 0.5174,
+ "step": 157
+ },
+ {
+ "epoch": 0.7429467084639498,
+ "grad_norm": 0.5002286434173584,
+ "learning_rate": 4.9698466508595655e-06,
+ "loss": 0.5453,
+ "step": 158
+ },
+ {
+ "epoch": 0.7476489028213166,
+ "grad_norm": 0.6070297360420227,
+ "learning_rate": 4.9688001033914756e-06,
+ "loss": 0.5327,
+ "step": 159
+ },
+ {
+ "epoch": 0.7523510971786834,
+ "grad_norm": 0.5436793565750122,
+ "learning_rate": 4.967735816880286e-06,
+ "loss": 0.544,
+ "step": 160
+ },
+ {
+ "epoch": 0.7570532915360502,
+ "grad_norm": 0.538012683391571,
+ "learning_rate": 4.966653798973205e-06,
+ "loss": 0.5233,
+ "step": 161
+ },
+ {
+ "epoch": 0.7617554858934169,
+ "grad_norm": 0.4916169345378876,
+ "learning_rate": 4.965554057444842e-06,
+ "loss": 0.5168,
+ "step": 162
+ },
+ {
+ "epoch": 0.7664576802507836,
+ "grad_norm": 0.48281437158584595,
+ "learning_rate": 4.964436600197161e-06,
+ "loss": 0.5393,
+ "step": 163
+ },
+ {
+ "epoch": 0.7711598746081505,
+ "grad_norm": 0.5184990167617798,
+ "learning_rate": 4.963301435259413e-06,
+ "loss": 0.5085,
+ "step": 164
+ },
+ {
+ "epoch": 0.7758620689655172,
+ "grad_norm": 0.4706438183784485,
+ "learning_rate": 4.962148570788088e-06,
+ "loss": 0.5299,
+ "step": 165
+ },
+ {
+ "epoch": 0.780564263322884,
+ "grad_norm": 0.6550764441490173,
+ "learning_rate": 4.96097801506685e-06,
+ "loss": 0.5192,
+ "step": 166
+ },
+ {
+ "epoch": 0.7852664576802508,
+ "grad_norm": 0.5386581420898438,
+ "learning_rate": 4.959789776506482e-06,
+ "loss": 0.5258,
+ "step": 167
+ },
+ {
+ "epoch": 0.7899686520376176,
+ "grad_norm": 0.5060779452323914,
+ "learning_rate": 4.958583863644821e-06,
+ "loss": 0.5512,
+ "step": 168
+ },
+ {
+ "epoch": 0.7946708463949843,
+ "grad_norm": 0.47050032019615173,
+ "learning_rate": 4.9573602851466985e-06,
+ "loss": 0.5176,
+ "step": 169
+ },
+ {
+ "epoch": 0.799373040752351,
+ "grad_norm": 7.3139567375183105,
+ "learning_rate": 4.9561190498038815e-06,
+ "loss": 0.5381,
+ "step": 170
+ },
+ {
+ "epoch": 0.8040752351097179,
+ "grad_norm": 0.620528519153595,
+ "learning_rate": 4.954860166535005e-06,
+ "loss": 0.5299,
+ "step": 171
+ },
+ {
+ "epoch": 0.8087774294670846,
+ "grad_norm": 0.45067766308784485,
+ "learning_rate": 4.95358364438551e-06,
+ "loss": 0.5328,
+ "step": 172
+ },
+ {
+ "epoch": 0.8134796238244514,
+ "grad_norm": 0.6771508455276489,
+ "learning_rate": 4.952289492527576e-06,
+ "loss": 0.5601,
+ "step": 173
+ },
+ {
+ "epoch": 0.8181818181818182,
+ "grad_norm": 0.518925130367279,
+ "learning_rate": 4.9509777202600605e-06,
+ "loss": 0.494,
+ "step": 174
+ },
+ {
+ "epoch": 0.822884012539185,
+ "grad_norm": 0.5191988945007324,
+ "learning_rate": 4.949648337008425e-06,
+ "loss": 0.5425,
+ "step": 175
+ },
+ {
+ "epoch": 0.8275862068965517,
+ "grad_norm": 0.8600963354110718,
+ "learning_rate": 4.948301352324674e-06,
+ "loss": 0.5332,
+ "step": 176
+ },
+ {
+ "epoch": 0.8322884012539185,
+ "grad_norm": 0.5405915379524231,
+ "learning_rate": 4.946936775887281e-06,
+ "loss": 0.5276,
+ "step": 177
+ },
+ {
+ "epoch": 0.8369905956112853,
+ "grad_norm": 0.48730772733688354,
+ "learning_rate": 4.945554617501124e-06,
+ "loss": 0.5217,
+ "step": 178
+ },
+ {
+ "epoch": 0.841692789968652,
+ "grad_norm": 0.5092865824699402,
+ "learning_rate": 4.944154887097411e-06,
+ "loss": 0.5534,
+ "step": 179
+ },
+ {
+ "epoch": 0.8463949843260188,
+ "grad_norm": 0.4994933605194092,
+ "learning_rate": 4.942737594733608e-06,
+ "loss": 0.5242,
+ "step": 180
+ },
+ {
+ "epoch": 0.8510971786833855,
+ "grad_norm": 0.4554043412208557,
+ "learning_rate": 4.941302750593373e-06,
+ "loss": 0.5424,
+ "step": 181
+ },
+ {
+ "epoch": 0.8557993730407524,
+ "grad_norm": 0.4865265488624573,
+ "learning_rate": 4.939850364986475e-06,
+ "loss": 0.482,
+ "step": 182
+ },
+ {
+ "epoch": 0.8605015673981191,
+ "grad_norm": 0.5013875365257263,
+ "learning_rate": 4.938380448348725e-06,
+ "loss": 0.4908,
+ "step": 183
+ },
+ {
+ "epoch": 0.8652037617554859,
+ "grad_norm": 0.4997917115688324,
+ "learning_rate": 4.9368930112419e-06,
+ "loss": 0.5336,
+ "step": 184
+ },
+ {
+ "epoch": 0.8699059561128527,
+ "grad_norm": 0.4783482551574707,
+ "learning_rate": 4.935388064353665e-06,
+ "loss": 0.5338,
+ "step": 185
+ },
+ {
+ "epoch": 0.8746081504702194,
+ "grad_norm": 0.7221089005470276,
+ "learning_rate": 4.9338656184975e-06,
+ "loss": 0.5327,
+ "step": 186
+ },
+ {
+ "epoch": 0.8793103448275862,
+ "grad_norm": 0.48115843534469604,
+ "learning_rate": 4.932325684612618e-06,
+ "loss": 0.5408,
+ "step": 187
+ },
+ {
+ "epoch": 0.8840125391849529,
+ "grad_norm": 0.4940219223499298,
+ "learning_rate": 4.93076827376389e-06,
+ "loss": 0.5455,
+ "step": 188
+ },
+ {
+ "epoch": 0.8887147335423198,
+ "grad_norm": 0.4754747450351715,
+ "learning_rate": 4.9291933971417635e-06,
+ "loss": 0.542,
+ "step": 189
+ },
+ {
+ "epoch": 0.8934169278996865,
+ "grad_norm": 0.548713207244873,
+ "learning_rate": 4.9276010660621835e-06,
+ "loss": 0.5292,
+ "step": 190
+ },
+ {
+ "epoch": 0.8981191222570533,
+ "grad_norm": 0.7292612195014954,
+ "learning_rate": 4.925991291966508e-06,
+ "loss": 0.5073,
+ "step": 191
+ },
+ {
+ "epoch": 0.9028213166144201,
+ "grad_norm": 0.5254770517349243,
+ "learning_rate": 4.92436408642143e-06,
+ "loss": 0.5451,
+ "step": 192
+ },
+ {
+ "epoch": 0.9075235109717869,
+ "grad_norm": 0.47938767075538635,
+ "learning_rate": 4.9227194611188934e-06,
+ "loss": 0.5204,
+ "step": 193
+ },
+ {
+ "epoch": 0.9122257053291536,
+ "grad_norm": 0.6740232706069946,
+ "learning_rate": 4.921057427876007e-06,
+ "loss": 0.4928,
+ "step": 194
+ },
+ {
+ "epoch": 0.9169278996865203,
+ "grad_norm": 0.5455343723297119,
+ "learning_rate": 4.919377998634959e-06,
+ "loss": 0.5468,
+ "step": 195
+ },
+ {
+ "epoch": 0.9216300940438872,
+ "grad_norm": 0.5001958012580872,
+ "learning_rate": 4.917681185462934e-06,
+ "loss": 0.5339,
+ "step": 196
+ },
+ {
+ "epoch": 0.9263322884012539,
+ "grad_norm": 0.5084257125854492,
+ "learning_rate": 4.915967000552028e-06,
+ "loss": 0.5259,
+ "step": 197
+ },
+ {
+ "epoch": 0.9310344827586207,
+ "grad_norm": 0.4807164967060089,
+ "learning_rate": 4.914235456219154e-06,
+ "loss": 0.5204,
+ "step": 198
+ },
+ {
+ "epoch": 0.9357366771159875,
+ "grad_norm": 0.6099370718002319,
+ "learning_rate": 4.912486564905959e-06,
+ "loss": 0.544,
+ "step": 199
+ },
+ {
+ "epoch": 0.9404388714733543,
+ "grad_norm": 0.47461947798728943,
+ "learning_rate": 4.910720339178735e-06,
+ "loss": 0.5295,
+ "step": 200
+ },
+ {
+ "epoch": 0.945141065830721,
+ "grad_norm": 0.500136137008667,
+ "learning_rate": 4.908936791728323e-06,
+ "loss": 0.5321,
+ "step": 201
+ },
+ {
+ "epoch": 0.9498432601880877,
+ "grad_norm": 0.5235631465911865,
+ "learning_rate": 4.907135935370027e-06,
+ "loss": 0.5338,
+ "step": 202
+ },
+ {
+ "epoch": 0.9545454545454546,
+ "grad_norm": 0.9285804629325867,
+ "learning_rate": 4.905317783043523e-06,
+ "loss": 0.5393,
+ "step": 203
+ },
+ {
+ "epoch": 0.9592476489028213,
+ "grad_norm": 0.4834178388118744,
+ "learning_rate": 4.9034823478127605e-06,
+ "loss": 0.5211,
+ "step": 204
+ },
+ {
+ "epoch": 0.9639498432601881,
+ "grad_norm": 0.4830580949783325,
+ "learning_rate": 4.901629642865872e-06,
+ "loss": 0.4986,
+ "step": 205
+ },
+ {
+ "epoch": 0.9686520376175548,
+ "grad_norm": 0.49718615412712097,
+ "learning_rate": 4.89975968151508e-06,
+ "loss": 0.5204,
+ "step": 206
+ },
+ {
+ "epoch": 0.9733542319749217,
+ "grad_norm": 0.5056726336479187,
+ "learning_rate": 4.8978724771965965e-06,
+ "loss": 0.5133,
+ "step": 207
+ },
+ {
+ "epoch": 0.9780564263322884,
+ "grad_norm": 0.7357563376426697,
+ "learning_rate": 4.895968043470532e-06,
+ "loss": 0.5307,
+ "step": 208
+ },
+ {
+ "epoch": 0.9827586206896551,
+ "grad_norm": 0.515610933303833,
+ "learning_rate": 4.894046394020794e-06,
+ "loss": 0.4955,
+ "step": 209
+ },
+ {
+ "epoch": 0.987460815047022,
+ "grad_norm": 0.5124618411064148,
+ "learning_rate": 4.892107542654988e-06,
+ "loss": 0.526,
+ "step": 210
+ },
+ {
+ "epoch": 0.9921630094043887,
+ "grad_norm": 0.5059565901756287,
+ "learning_rate": 4.890151503304325e-06,
+ "loss": 0.5473,
+ "step": 211
+ },
+ {
+ "epoch": 0.9968652037617555,
+ "grad_norm": 0.4806717336177826,
+ "learning_rate": 4.88817829002351e-06,
+ "loss": 0.5212,
+ "step": 212
+ },
+ {
+ "epoch": 1.0047021943573669,
+ "grad_norm": 0.9454345703125,
+ "learning_rate": 4.886187916990653e-06,
+ "loss": 1.0566,
+ "step": 213
+ },
+ {
+ "epoch": 1.0094043887147335,
+ "grad_norm": 0.4871070086956024,
+ "learning_rate": 4.884180398507163e-06,
+ "loss": 0.503,
+ "step": 214
+ },
+ {
+ "epoch": 1.0141065830721003,
+ "grad_norm": 0.45102012157440186,
+ "learning_rate": 4.882155748997636e-06,
+ "loss": 0.4954,
+ "step": 215
+ },
+ {
+ "epoch": 1.0188087774294672,
+ "grad_norm": 0.49910685420036316,
+ "learning_rate": 4.8801139830097685e-06,
+ "loss": 0.5019,
+ "step": 216
+ },
+ {
+ "epoch": 1.0235109717868338,
+ "grad_norm": 0.5155763030052185,
+ "learning_rate": 4.878055115214238e-06,
+ "loss": 0.5102,
+ "step": 217
+ },
+ {
+ "epoch": 1.0282131661442007,
+ "grad_norm": 0.4567059874534607,
+ "learning_rate": 4.875979160404607e-06,
+ "loss": 0.5069,
+ "step": 218
+ },
+ {
+ "epoch": 1.0329153605015673,
+ "grad_norm": 0.4782896935939789,
+ "learning_rate": 4.873886133497209e-06,
+ "loss": 0.5182,
+ "step": 219
+ },
+ {
+ "epoch": 1.0376175548589341,
+ "grad_norm": 0.44995731115341187,
+ "learning_rate": 4.87177604953105e-06,
+ "loss": 0.513,
+ "step": 220
+ },
+ {
+ "epoch": 1.042319749216301,
+ "grad_norm": 0.470059871673584,
+ "learning_rate": 4.869648923667694e-06,
+ "loss": 0.468,
+ "step": 221
+ },
+ {
+ "epoch": 1.0470219435736676,
+ "grad_norm": 0.5356128215789795,
+ "learning_rate": 4.867504771191154e-06,
+ "loss": 0.4942,
+ "step": 222
+ },
+ {
+ "epoch": 1.0517241379310345,
+ "grad_norm": 0.5137870907783508,
+ "learning_rate": 4.865343607507788e-06,
+ "loss": 0.5022,
+ "step": 223
+ },
+ {
+ "epoch": 1.0564263322884013,
+ "grad_norm": 0.47419992089271545,
+ "learning_rate": 4.86316544814618e-06,
+ "loss": 0.5158,
+ "step": 224
+ },
+ {
+ "epoch": 1.061128526645768,
+ "grad_norm": 0.49087393283843994,
+ "learning_rate": 4.860970308757038e-06,
+ "loss": 0.4605,
+ "step": 225
+ },
+ {
+ "epoch": 1.0658307210031348,
+ "grad_norm": 0.4988348186016083,
+ "learning_rate": 4.858758205113072e-06,
+ "loss": 0.4912,
+ "step": 226
+ },
+ {
+ "epoch": 1.0705329153605017,
+ "grad_norm": 0.44543248414993286,
+ "learning_rate": 4.856529153108888e-06,
+ "loss": 0.524,
+ "step": 227
+ },
+ {
+ "epoch": 1.0752351097178683,
+ "grad_norm": 0.5953351259231567,
+ "learning_rate": 4.854283168760868e-06,
+ "loss": 0.5001,
+ "step": 228
+ },
+ {
+ "epoch": 1.0799373040752351,
+ "grad_norm": 0.5012004375457764,
+ "learning_rate": 4.85202026820706e-06,
+ "loss": 0.4968,
+ "step": 229
+ },
+ {
+ "epoch": 1.084639498432602,
+ "grad_norm": 0.5023937821388245,
+ "learning_rate": 4.84974046770706e-06,
+ "loss": 0.5345,
+ "step": 230
+ },
+ {
+ "epoch": 1.0893416927899686,
+ "grad_norm": 0.4705684185028076,
+ "learning_rate": 4.847443783641893e-06,
+ "loss": 0.4459,
+ "step": 231
+ },
+ {
+ "epoch": 1.0940438871473355,
+ "grad_norm": 0.5082476735115051,
+ "learning_rate": 4.845130232513901e-06,
+ "loss": 0.4905,
+ "step": 232
+ },
+ {
+ "epoch": 1.098746081504702,
+ "grad_norm": 0.5283995866775513,
+ "learning_rate": 4.842799830946615e-06,
+ "loss": 0.4878,
+ "step": 233
+ },
+ {
+ "epoch": 1.103448275862069,
+ "grad_norm": 0.6373623013496399,
+ "learning_rate": 4.840452595684646e-06,
+ "loss": 0.4867,
+ "step": 234
+ },
+ {
+ "epoch": 1.1081504702194358,
+ "grad_norm": 0.4624481201171875,
+ "learning_rate": 4.83808854359356e-06,
+ "loss": 0.4793,
+ "step": 235
+ },
+ {
+ "epoch": 1.1128526645768024,
+ "grad_norm": 0.4659098982810974,
+ "learning_rate": 4.835707691659753e-06,
+ "loss": 0.4827,
+ "step": 236
+ },
+ {
+ "epoch": 1.1175548589341693,
+ "grad_norm": 0.4920850396156311,
+ "learning_rate": 4.8333100569903365e-06,
+ "loss": 0.4932,
+ "step": 237
+ },
+ {
+ "epoch": 1.1222570532915361,
+ "grad_norm": 0.492286741733551,
+ "learning_rate": 4.8308956568130094e-06,
+ "loss": 0.5144,
+ "step": 238
+ },
+ {
+ "epoch": 1.1269592476489028,
+ "grad_norm": 0.5429807901382446,
+ "learning_rate": 4.828464508475934e-06,
+ "loss": 0.5054,
+ "step": 239
+ },
+ {
+ "epoch": 1.1316614420062696,
+ "grad_norm": 2.4671998023986816,
+ "learning_rate": 4.826016629447616e-06,
+ "loss": 0.5073,
+ "step": 240
+ },
+ {
+ "epoch": 1.1363636363636362,
+ "grad_norm": 0.4593118131160736,
+ "learning_rate": 4.823552037316775e-06,
+ "loss": 0.4856,
+ "step": 241
+ },
+ {
+ "epoch": 1.141065830721003,
+ "grad_norm": 0.6855646371841431,
+ "learning_rate": 4.821070749792218e-06,
+ "loss": 0.5388,
+ "step": 242
+ },
+ {
+ "epoch": 1.14576802507837,
+ "grad_norm": 0.5722374320030212,
+ "learning_rate": 4.818572784702713e-06,
+ "loss": 0.51,
+ "step": 243
+ },
+ {
+ "epoch": 1.1504702194357366,
+ "grad_norm": 0.4901357591152191,
+ "learning_rate": 4.816058159996863e-06,
+ "loss": 0.5201,
+ "step": 244
+ },
+ {
+ "epoch": 1.1551724137931034,
+ "grad_norm": 0.4655209481716156,
+ "learning_rate": 4.813526893742972e-06,
+ "loss": 0.501,
+ "step": 245
+ },
+ {
+ "epoch": 1.1598746081504703,
+ "grad_norm": 0.7608394622802734,
+ "learning_rate": 4.810979004128924e-06,
+ "loss": 0.4961,
+ "step": 246
+ },
+ {
+ "epoch": 1.164576802507837,
+ "grad_norm": 0.4857081472873688,
+ "learning_rate": 4.808414509462042e-06,
+ "loss": 0.5174,
+ "step": 247
+ },
+ {
+ "epoch": 1.1692789968652038,
+ "grad_norm": 0.46672946214675903,
+ "learning_rate": 4.80583342816896e-06,
+ "loss": 0.484,
+ "step": 248
+ },
+ {
+ "epoch": 1.1739811912225706,
+ "grad_norm": 0.46982088685035706,
+ "learning_rate": 4.803235778795496e-06,
+ "loss": 0.5236,
+ "step": 249
+ },
+ {
+ "epoch": 1.1786833855799372,
+ "grad_norm": 0.5086098909378052,
+ "learning_rate": 4.800621580006511e-06,
+ "loss": 0.4673,
+ "step": 250
+ },
+ {
+ "epoch": 1.183385579937304,
+ "grad_norm": 0.45968860387802124,
+ "learning_rate": 4.797990850585782e-06,
+ "loss": 0.5151,
+ "step": 251
+ },
+ {
+ "epoch": 1.188087774294671,
+ "grad_norm": 0.49544984102249146,
+ "learning_rate": 4.79534360943586e-06,
+ "loss": 0.494,
+ "step": 252
+ },
+ {
+ "epoch": 1.1927899686520376,
+ "grad_norm": 0.531892716884613,
+ "learning_rate": 4.792679875577937e-06,
+ "loss": 0.4778,
+ "step": 253
+ },
+ {
+ "epoch": 1.1974921630094044,
+ "grad_norm": 0.5013542175292969,
+ "learning_rate": 4.789999668151714e-06,
+ "loss": 0.5132,
+ "step": 254
+ },
+ {
+ "epoch": 1.2021943573667713,
+ "grad_norm": 0.46963250637054443,
+ "learning_rate": 4.7873030064152545e-06,
+ "loss": 0.4938,
+ "step": 255
+ },
+ {
+ "epoch": 1.206896551724138,
+ "grad_norm": 0.465285986661911,
+ "learning_rate": 4.784589909744856e-06,
+ "loss": 0.4898,
+ "step": 256
+ },
+ {
+ "epoch": 1.2115987460815048,
+ "grad_norm": 0.5183936357498169,
+ "learning_rate": 4.7818603976349005e-06,
+ "loss": 0.5004,
+ "step": 257
+ },
+ {
+ "epoch": 1.2163009404388714,
+ "grad_norm": 0.47324836254119873,
+ "learning_rate": 4.779114489697724e-06,
+ "loss": 0.4972,
+ "step": 258
+ },
+ {
+ "epoch": 1.2210031347962382,
+ "grad_norm": 0.5208264589309692,
+ "learning_rate": 4.776352205663469e-06,
+ "loss": 0.5023,
+ "step": 259
+ },
+ {
+ "epoch": 1.225705329153605,
+ "grad_norm": 0.5583804845809937,
+ "learning_rate": 4.773573565379947e-06,
+ "loss": 0.5099,
+ "step": 260
+ },
+ {
+ "epoch": 1.2304075235109717,
+ "grad_norm": 0.5016160011291504,
+ "learning_rate": 4.770778588812489e-06,
+ "loss": 0.4765,
+ "step": 261
+ },
+ {
+ "epoch": 1.2351097178683386,
+ "grad_norm": 0.50210040807724,
+ "learning_rate": 4.7679672960438135e-06,
+ "loss": 0.5029,
+ "step": 262
+ },
+ {
+ "epoch": 1.2398119122257054,
+ "grad_norm": 0.6636150479316711,
+ "learning_rate": 4.765139707273872e-06,
+ "loss": 0.4909,
+ "step": 263
+ },
+ {
+ "epoch": 1.244514106583072,
+ "grad_norm": 0.4798625111579895,
+ "learning_rate": 4.762295842819707e-06,
+ "loss": 0.5012,
+ "step": 264
+ },
+ {
+ "epoch": 1.249216300940439,
+ "grad_norm": 0.5282374024391174,
+ "learning_rate": 4.759435723115308e-06,
+ "loss": 0.4681,
+ "step": 265
+ },
+ {
+ "epoch": 1.2539184952978055,
+ "grad_norm": 0.5356930494308472,
+ "learning_rate": 4.756559368711463e-06,
+ "loss": 0.506,
+ "step": 266
+ },
+ {
+ "epoch": 1.2586206896551724,
+ "grad_norm": 0.4857093095779419,
+ "learning_rate": 4.75366680027561e-06,
+ "loss": 0.4889,
+ "step": 267
+ },
+ {
+ "epoch": 1.2633228840125392,
+ "grad_norm": 0.484018474817276,
+ "learning_rate": 4.7507580385916906e-06,
+ "loss": 0.4899,
+ "step": 268
+ },
+ {
+ "epoch": 1.2680250783699059,
+ "grad_norm": 0.49720871448516846,
+ "learning_rate": 4.747833104559999e-06,
+ "loss": 0.4654,
+ "step": 269
+ },
+ {
+ "epoch": 1.2727272727272727,
+ "grad_norm": 0.4631911516189575,
+ "learning_rate": 4.744892019197033e-06,
+ "loss": 0.4796,
+ "step": 270
+ },
+ {
+ "epoch": 1.2774294670846396,
+ "grad_norm": 0.5116872787475586,
+ "learning_rate": 4.74193480363534e-06,
+ "loss": 0.4883,
+ "step": 271
+ },
+ {
+ "epoch": 1.2821316614420062,
+ "grad_norm": 0.5275093913078308,
+ "learning_rate": 4.738961479123373e-06,
+ "loss": 0.496,
+ "step": 272
+ },
+ {
+ "epoch": 1.286833855799373,
+ "grad_norm": 0.5001885890960693,
+ "learning_rate": 4.735972067025326e-06,
+ "loss": 0.5012,
+ "step": 273
+ },
+ {
+ "epoch": 1.29153605015674,
+ "grad_norm": 0.5875861048698425,
+ "learning_rate": 4.732966588820991e-06,
+ "loss": 0.4951,
+ "step": 274
+ },
+ {
+ "epoch": 1.2962382445141065,
+ "grad_norm": 0.4893011748790741,
+ "learning_rate": 4.729945066105599e-06,
+ "loss": 0.4742,
+ "step": 275
+ },
+ {
+ "epoch": 1.3009404388714734,
+ "grad_norm": 0.4648543894290924,
+ "learning_rate": 4.726907520589664e-06,
+ "loss": 0.466,
+ "step": 276
+ },
+ {
+ "epoch": 1.3056426332288402,
+ "grad_norm": 0.5300162434577942,
+ "learning_rate": 4.72385397409883e-06,
+ "loss": 0.5072,
+ "step": 277
+ },
+ {
+ "epoch": 1.3103448275862069,
+ "grad_norm": 0.4667080044746399,
+ "learning_rate": 4.720784448573712e-06,
+ "loss": 0.4986,
+ "step": 278
+ },
+ {
+ "epoch": 1.3150470219435737,
+ "grad_norm": 0.5278895497322083,
+ "learning_rate": 4.717698966069739e-06,
+ "loss": 0.5269,
+ "step": 279
+ },
+ {
+ "epoch": 1.3197492163009406,
+ "grad_norm": 0.5325866937637329,
+ "learning_rate": 4.7145975487569965e-06,
+ "loss": 0.5074,
+ "step": 280
+ },
+ {
+ "epoch": 1.3244514106583072,
+ "grad_norm": 0.500861644744873,
+ "learning_rate": 4.711480218920064e-06,
+ "loss": 0.4695,
+ "step": 281
+ },
+ {
+ "epoch": 1.329153605015674,
+ "grad_norm": 0.5263222455978394,
+ "learning_rate": 4.708346998957859e-06,
+ "loss": 0.5173,
+ "step": 282
+ },
+ {
+ "epoch": 1.3338557993730409,
+ "grad_norm": 0.622900128364563,
+ "learning_rate": 4.705197911383473e-06,
+ "loss": 0.4905,
+ "step": 283
+ },
+ {
+ "epoch": 1.3385579937304075,
+ "grad_norm": 0.49273768067359924,
+ "learning_rate": 4.7020329788240115e-06,
+ "loss": 0.4743,
+ "step": 284
+ },
+ {
+ "epoch": 1.3432601880877744,
+ "grad_norm": 0.49558964371681213,
+ "learning_rate": 4.6988522240204325e-06,
+ "loss": 0.4824,
+ "step": 285
+ },
+ {
+ "epoch": 1.347962382445141,
+ "grad_norm": 0.4743976891040802,
+ "learning_rate": 4.695655669827377e-06,
+ "loss": 0.4977,
+ "step": 286
+ },
+ {
+ "epoch": 1.3526645768025078,
+ "grad_norm": 0.49542659521102905,
+ "learning_rate": 4.6924433392130135e-06,
+ "loss": 0.4924,
+ "step": 287
+ },
+ {
+ "epoch": 1.3573667711598745,
+ "grad_norm": 0.7385990619659424,
+ "learning_rate": 4.689215255258866e-06,
+ "loss": 0.5091,
+ "step": 288
+ },
+ {
+ "epoch": 1.3620689655172413,
+ "grad_norm": 0.4826123118400574,
+ "learning_rate": 4.685971441159653e-06,
+ "loss": 0.4791,
+ "step": 289
+ },
+ {
+ "epoch": 1.3667711598746082,
+ "grad_norm": 0.5389033555984497,
+ "learning_rate": 4.682711920223115e-06,
+ "loss": 0.4751,
+ "step": 290
+ },
+ {
+ "epoch": 1.3714733542319748,
+ "grad_norm": 0.5059546232223511,
+ "learning_rate": 4.679436715869856e-06,
+ "loss": 0.499,
+ "step": 291
+ },
+ {
+ "epoch": 1.3761755485893417,
+ "grad_norm": 0.5682849884033203,
+ "learning_rate": 4.676145851633166e-06,
+ "loss": 0.5143,
+ "step": 292
+ },
+ {
+ "epoch": 1.3808777429467085,
+ "grad_norm": 0.4754337668418884,
+ "learning_rate": 4.672839351158856e-06,
+ "loss": 0.4997,
+ "step": 293
+ },
+ {
+ "epoch": 1.3855799373040751,
+ "grad_norm": 0.5227643847465515,
+ "learning_rate": 4.669517238205089e-06,
+ "loss": 0.4834,
+ "step": 294
+ },
+ {
+ "epoch": 1.390282131661442,
+ "grad_norm": 0.4954044222831726,
+ "learning_rate": 4.666179536642208e-06,
+ "loss": 0.483,
+ "step": 295
+ },
+ {
+ "epoch": 1.3949843260188088,
+ "grad_norm": 0.4909021556377411,
+ "learning_rate": 4.662826270452565e-06,
+ "loss": 0.4808,
+ "step": 296
+ },
+ {
+ "epoch": 1.3996865203761755,
+ "grad_norm": 0.4666971266269684,
+ "learning_rate": 4.659457463730347e-06,
+ "loss": 0.488,
+ "step": 297
+ },
+ {
+ "epoch": 1.4043887147335423,
+ "grad_norm": 0.5064187049865723,
+ "learning_rate": 4.6560731406814056e-06,
+ "loss": 0.5046,
+ "step": 298
+ },
+ {
+ "epoch": 1.4090909090909092,
+ "grad_norm": 0.4958318769931793,
+ "learning_rate": 4.65267332562308e-06,
+ "loss": 0.5102,
+ "step": 299
+ },
+ {
+ "epoch": 1.4137931034482758,
+ "grad_norm": 0.5080632567405701,
+ "learning_rate": 4.649258042984026e-06,
+ "loss": 0.5055,
+ "step": 300
+ },
+ {
+ "epoch": 1.4184952978056427,
+ "grad_norm": 0.46236541867256165,
+ "learning_rate": 4.6458273173040395e-06,
+ "loss": 0.4606,
+ "step": 301
+ },
+ {
+ "epoch": 1.4231974921630095,
+ "grad_norm": 1.8524898290634155,
+ "learning_rate": 4.642381173233874e-06,
+ "loss": 0.5002,
+ "step": 302
+ },
+ {
+ "epoch": 1.4278996865203761,
+ "grad_norm": 0.5202615261077881,
+ "learning_rate": 4.638919635535073e-06,
+ "loss": 0.4562,
+ "step": 303
+ },
+ {
+ "epoch": 1.432601880877743,
+ "grad_norm": 0.5293647050857544,
+ "learning_rate": 4.635442729079788e-06,
+ "loss": 0.4806,
+ "step": 304
+ },
+ {
+ "epoch": 1.4373040752351098,
+ "grad_norm": 0.5165356993675232,
+ "learning_rate": 4.6319504788505956e-06,
+ "loss": 0.4775,
+ "step": 305
+ },
+ {
+ "epoch": 1.4420062695924765,
+ "grad_norm": 0.5092841386795044,
+ "learning_rate": 4.628442909940325e-06,
+ "loss": 0.4892,
+ "step": 306
+ },
+ {
+ "epoch": 1.4467084639498433,
+ "grad_norm": 0.511424720287323,
+ "learning_rate": 4.624920047551874e-06,
+ "loss": 0.506,
+ "step": 307
+ },
+ {
+ "epoch": 1.4514106583072102,
+ "grad_norm": 0.5631566643714905,
+ "learning_rate": 4.621381916998029e-06,
+ "loss": 0.4741,
+ "step": 308
+ },
+ {
+ "epoch": 1.4561128526645768,
+ "grad_norm": 0.4748315215110779,
+ "learning_rate": 4.6178285437012806e-06,
+ "loss": 0.5084,
+ "step": 309
+ },
+ {
+ "epoch": 1.4608150470219436,
+ "grad_norm": 0.47158119082450867,
+ "learning_rate": 4.6142599531936435e-06,
+ "loss": 0.4697,
+ "step": 310
+ },
+ {
+ "epoch": 1.4655172413793103,
+ "grad_norm": 0.5358107089996338,
+ "learning_rate": 4.610676171116475e-06,
+ "loss": 0.491,
+ "step": 311
+ },
+ {
+ "epoch": 1.4702194357366771,
+ "grad_norm": 0.47717440128326416,
+ "learning_rate": 4.607077223220286e-06,
+ "loss": 0.4948,
+ "step": 312
+ },
+ {
+ "epoch": 1.4749216300940438,
+ "grad_norm": 0.5041193962097168,
+ "learning_rate": 4.603463135364556e-06,
+ "loss": 0.4648,
+ "step": 313
+ },
+ {
+ "epoch": 1.4796238244514106,
+ "grad_norm": 0.9311274290084839,
+ "learning_rate": 4.5998339335175555e-06,
+ "loss": 0.4866,
+ "step": 314
+ },
+ {
+ "epoch": 1.4843260188087775,
+ "grad_norm": 0.47408604621887207,
+ "learning_rate": 4.596189643756147e-06,
+ "loss": 0.4634,
+ "step": 315
+ },
+ {
+ "epoch": 1.489028213166144,
+ "grad_norm": 0.5052632093429565,
+ "learning_rate": 4.592530292265609e-06,
+ "loss": 0.4843,
+ "step": 316
+ },
+ {
+ "epoch": 1.493730407523511,
+ "grad_norm": 0.5100846886634827,
+ "learning_rate": 4.58885590533944e-06,
+ "loss": 0.4942,
+ "step": 317
+ },
+ {
+ "epoch": 1.4984326018808778,
+ "grad_norm": 0.5132214426994324,
+ "learning_rate": 4.585166509379173e-06,
+ "loss": 0.5135,
+ "step": 318
+ },
+ {
+ "epoch": 1.5031347962382444,
+ "grad_norm": 11.112855911254883,
+ "learning_rate": 4.581462130894186e-06,
+ "loss": 0.4933,
+ "step": 319
+ },
+ {
+ "epoch": 1.5078369905956113,
+ "grad_norm": 0.4873805642127991,
+ "learning_rate": 4.57774279650151e-06,
+ "loss": 0.483,
+ "step": 320
+ },
+ {
+ "epoch": 1.5125391849529781,
+ "grad_norm": 0.5026459693908691,
+ "learning_rate": 4.574008532925638e-06,
+ "loss": 0.5075,
+ "step": 321
+ },
+ {
+ "epoch": 1.5172413793103448,
+ "grad_norm": 0.489947110414505,
+ "learning_rate": 4.570259366998336e-06,
+ "loss": 0.4954,
+ "step": 322
+ },
+ {
+ "epoch": 1.5219435736677116,
+ "grad_norm": 0.48120853304862976,
+ "learning_rate": 4.566495325658445e-06,
+ "loss": 0.5221,
+ "step": 323
+ },
+ {
+ "epoch": 1.5266457680250785,
+ "grad_norm": 0.4880066514015198,
+ "learning_rate": 4.5627164359516915e-06,
+ "loss": 0.5031,
+ "step": 324
+ },
+ {
+ "epoch": 1.531347962382445,
+ "grad_norm": 0.5048410892486572,
+ "learning_rate": 4.558922725030491e-06,
+ "loss": 0.4757,
+ "step": 325
+ },
+ {
+ "epoch": 1.536050156739812,
+ "grad_norm": 0.7033756375312805,
+ "learning_rate": 4.555114220153755e-06,
+ "loss": 0.4285,
+ "step": 326
+ },
+ {
+ "epoch": 1.5407523510971788,
+ "grad_norm": 0.4716516435146332,
+ "learning_rate": 4.551290948686693e-06,
+ "loss": 0.5121,
+ "step": 327
+ },
+ {
+ "epoch": 1.5454545454545454,
+ "grad_norm": 0.4782696068286896,
+ "learning_rate": 4.547452938100615e-06,
+ "loss": 0.5176,
+ "step": 328
+ },
+ {
+ "epoch": 1.5501567398119123,
+ "grad_norm": 0.5119273066520691,
+ "learning_rate": 4.54360021597274e-06,
+ "loss": 0.4941,
+ "step": 329
+ },
+ {
+ "epoch": 1.5548589341692791,
+ "grad_norm": 0.5010069608688354,
+ "learning_rate": 4.539732809985989e-06,
+ "loss": 0.4862,
+ "step": 330
+ },
+ {
+ "epoch": 1.5595611285266457,
+ "grad_norm": 0.5129932165145874,
+ "learning_rate": 4.535850747928796e-06,
+ "loss": 0.4978,
+ "step": 331
+ },
+ {
+ "epoch": 1.5642633228840124,
+ "grad_norm": 0.4957594573497772,
+ "learning_rate": 4.531954057694897e-06,
+ "loss": 0.4814,
+ "step": 332
+ },
+ {
+ "epoch": 1.5689655172413794,
+ "grad_norm": 0.5642824172973633,
+ "learning_rate": 4.5280427672831414e-06,
+ "loss": 0.4888,
+ "step": 333
+ },
+ {
+ "epoch": 1.573667711598746,
+ "grad_norm": 0.4562854468822479,
+ "learning_rate": 4.524116904797281e-06,
+ "loss": 0.4648,
+ "step": 334
+ },
+ {
+ "epoch": 1.5783699059561127,
+ "grad_norm": 0.4849218428134918,
+ "learning_rate": 4.520176498445774e-06,
+ "loss": 0.476,
+ "step": 335
+ },
+ {
+ "epoch": 1.5830721003134798,
+ "grad_norm": 0.5046947002410889,
+ "learning_rate": 4.516221576541581e-06,
+ "loss": 0.4776,
+ "step": 336
+ },
+ {
+ "epoch": 1.5877742946708464,
+ "grad_norm": 0.48211777210235596,
+ "learning_rate": 4.512252167501959e-06,
+ "loss": 0.479,
+ "step": 337
+ },
+ {
+ "epoch": 1.592476489028213,
+ "grad_norm": 0.4812171459197998,
+ "learning_rate": 4.508268299848262e-06,
+ "loss": 0.4849,
+ "step": 338
+ },
+ {
+ "epoch": 1.59717868338558,
+ "grad_norm": 0.5865142345428467,
+ "learning_rate": 4.50427000220573e-06,
+ "loss": 0.499,
+ "step": 339
+ },
+ {
+ "epoch": 1.6018808777429467,
+ "grad_norm": 0.49277785420417786,
+ "learning_rate": 4.50025730330329e-06,
+ "loss": 0.475,
+ "step": 340
+ },
+ {
+ "epoch": 1.6065830721003134,
+ "grad_norm": 0.46771496534347534,
+ "learning_rate": 4.4962302319733445e-06,
+ "loss": 0.494,
+ "step": 341
+ },
+ {
+ "epoch": 1.6112852664576802,
+ "grad_norm": 0.5189441442489624,
+ "learning_rate": 4.492188817151565e-06,
+ "loss": 0.5275,
+ "step": 342
+ },
+ {
+ "epoch": 1.615987460815047,
+ "grad_norm": 0.48845574259757996,
+ "learning_rate": 4.488133087876688e-06,
+ "loss": 0.4676,
+ "step": 343
+ },
+ {
+ "epoch": 1.6206896551724137,
+ "grad_norm": 0.47189632058143616,
+ "learning_rate": 4.484063073290301e-06,
+ "loss": 0.4642,
+ "step": 344
+ },
+ {
+ "epoch": 1.6253918495297806,
+ "grad_norm": 0.5442587733268738,
+ "learning_rate": 4.479978802636637e-06,
+ "loss": 0.4981,
+ "step": 345
+ },
+ {
+ "epoch": 1.6300940438871474,
+ "grad_norm": 0.5048685073852539,
+ "learning_rate": 4.475880305262362e-06,
+ "loss": 0.5037,
+ "step": 346
+ },
+ {
+ "epoch": 1.634796238244514,
+ "grad_norm": 0.4781409800052643,
+ "learning_rate": 4.471767610616366e-06,
+ "loss": 0.4932,
+ "step": 347
+ },
+ {
+ "epoch": 1.6394984326018809,
+ "grad_norm": 0.47388938069343567,
+ "learning_rate": 4.467640748249549e-06,
+ "loss": 0.4687,
+ "step": 348
+ },
+ {
+ "epoch": 1.6442006269592477,
+ "grad_norm": 0.529712438583374,
+ "learning_rate": 4.4634997478146125e-06,
+ "loss": 0.487,
+ "step": 349
+ },
+ {
+ "epoch": 1.6489028213166144,
+ "grad_norm": 0.5114791393280029,
+ "learning_rate": 4.459344639065842e-06,
+ "loss": 0.4809,
+ "step": 350
+ },
+ {
+ "epoch": 1.6536050156739812,
+ "grad_norm": 0.45415258407592773,
+ "learning_rate": 4.455175451858897e-06,
+ "loss": 0.4901,
+ "step": 351
+ },
+ {
+ "epoch": 1.658307210031348,
+ "grad_norm": 0.5842339396476746,
+ "learning_rate": 4.450992216150592e-06,
+ "loss": 0.499,
+ "step": 352
+ },
+ {
+ "epoch": 1.6630094043887147,
+ "grad_norm": 0.48795560002326965,
+ "learning_rate": 4.446794961998689e-06,
+ "loss": 0.4659,
+ "step": 353
+ },
+ {
+ "epoch": 1.6677115987460815,
+ "grad_norm": 0.5531855225563049,
+ "learning_rate": 4.442583719561671e-06,
+ "loss": 0.4923,
+ "step": 354
+ },
+ {
+ "epoch": 1.6724137931034484,
+ "grad_norm": 0.5827644467353821,
+ "learning_rate": 4.438358519098536e-06,
+ "loss": 0.4991,
+ "step": 355
+ },
+ {
+ "epoch": 1.677115987460815,
+ "grad_norm": 0.5260423421859741,
+ "learning_rate": 4.4341193909685685e-06,
+ "loss": 0.4843,
+ "step": 356
+ },
+ {
+ "epoch": 1.6818181818181817,
+ "grad_norm": 0.4969344437122345,
+ "learning_rate": 4.429866365631134e-06,
+ "loss": 0.4915,
+ "step": 357
+ },
+ {
+ "epoch": 1.6865203761755487,
+ "grad_norm": 0.4725005030632019,
+ "learning_rate": 4.425599473645447e-06,
+ "loss": 0.4804,
+ "step": 358
+ },
+ {
+ "epoch": 1.6912225705329154,
+ "grad_norm": 0.47171467542648315,
+ "learning_rate": 4.421318745670364e-06,
+ "loss": 0.4823,
+ "step": 359
+ },
+ {
+ "epoch": 1.695924764890282,
+ "grad_norm": 0.4839799106121063,
+ "learning_rate": 4.4170242124641524e-06,
+ "loss": 0.4585,
+ "step": 360
+ },
+ {
+ "epoch": 1.700626959247649,
+ "grad_norm": 0.4786856472492218,
+ "learning_rate": 4.412715904884277e-06,
+ "loss": 0.49,
+ "step": 361
+ },
+ {
+ "epoch": 1.7053291536050157,
+ "grad_norm": 0.49980080127716064,
+ "learning_rate": 4.4083938538871735e-06,
+ "loss": 0.4675,
+ "step": 362
+ },
+ {
+ "epoch": 1.7100313479623823,
+ "grad_norm": 0.5201369524002075,
+ "learning_rate": 4.4040580905280295e-06,
+ "loss": 0.4862,
+ "step": 363
+ },
+ {
+ "epoch": 1.7147335423197492,
+ "grad_norm": 0.7051575183868408,
+ "learning_rate": 4.3997086459605586e-06,
+ "loss": 0.4822,
+ "step": 364
+ },
+ {
+ "epoch": 1.719435736677116,
+ "grad_norm": 0.48206666111946106,
+ "learning_rate": 4.395345551436779e-06,
+ "loss": 0.5076,
+ "step": 365
+ },
+ {
+ "epoch": 1.7241379310344827,
+ "grad_norm": 0.4817257821559906,
+ "learning_rate": 4.390968838306788e-06,
+ "loss": 0.4623,
+ "step": 366
+ },
+ {
+ "epoch": 1.7288401253918495,
+ "grad_norm": 0.5547840595245361,
+ "learning_rate": 4.386578538018535e-06,
+ "loss": 0.461,
+ "step": 367
+ },
+ {
+ "epoch": 1.7335423197492164,
+ "grad_norm": 0.5085346698760986,
+ "learning_rate": 4.382174682117598e-06,
+ "loss": 0.5068,
+ "step": 368
+ },
+ {
+ "epoch": 1.738244514106583,
+ "grad_norm": 0.4870692193508148,
+ "learning_rate": 4.377757302246956e-06,
+ "loss": 0.4403,
+ "step": 369
+ },
+ {
+ "epoch": 1.7429467084639498,
+ "grad_norm": 0.49482715129852295,
+ "learning_rate": 4.373326430146762e-06,
+ "loss": 0.4986,
+ "step": 370
+ },
+ {
+ "epoch": 1.7476489028213167,
+ "grad_norm": 0.5474854707717896,
+ "learning_rate": 4.368882097654113e-06,
+ "loss": 0.4938,
+ "step": 371
+ },
+ {
+ "epoch": 1.7523510971786833,
+ "grad_norm": 0.5055244565010071,
+ "learning_rate": 4.364424336702825e-06,
+ "loss": 0.4711,
+ "step": 372
+ },
+ {
+ "epoch": 1.7570532915360502,
+ "grad_norm": 0.48241329193115234,
+ "learning_rate": 4.3599531793232e-06,
+ "loss": 0.4856,
+ "step": 373
+ },
+ {
+ "epoch": 1.761755485893417,
+ "grad_norm": 0.4932602047920227,
+ "learning_rate": 4.355468657641797e-06,
+ "loss": 0.4818,
+ "step": 374
+ },
+ {
+ "epoch": 1.7664576802507836,
+ "grad_norm": 0.5512160658836365,
+ "learning_rate": 4.3509708038812035e-06,
+ "loss": 0.4864,
+ "step": 375
+ },
+ {
+ "epoch": 1.7711598746081505,
+ "grad_norm": 0.47026327252388,
+ "learning_rate": 4.346459650359798e-06,
+ "loss": 0.4825,
+ "step": 376
+ },
+ {
+ "epoch": 1.7758620689655173,
+ "grad_norm": 0.4831086993217468,
+ "learning_rate": 4.341935229491525e-06,
+ "loss": 0.4541,
+ "step": 377
+ },
+ {
+ "epoch": 1.780564263322884,
+ "grad_norm": 0.5045217871665955,
+ "learning_rate": 4.337397573785659e-06,
+ "loss": 0.5025,
+ "step": 378
+ },
+ {
+ "epoch": 1.7852664576802508,
+ "grad_norm": 0.5657753348350525,
+ "learning_rate": 4.332846715846566e-06,
+ "loss": 0.4698,
+ "step": 379
+ },
+ {
+ "epoch": 1.7899686520376177,
+ "grad_norm": 0.49546748399734497,
+ "learning_rate": 4.328282688373479e-06,
+ "loss": 0.4911,
+ "step": 380
+ },
+ {
+ "epoch": 1.7946708463949843,
+ "grad_norm": 0.5037291049957275,
+ "learning_rate": 4.323705524160258e-06,
+ "loss": 0.4877,
+ "step": 381
+ },
+ {
+ "epoch": 1.799373040752351,
+ "grad_norm": 0.5256901383399963,
+ "learning_rate": 4.319115256095149e-06,
+ "loss": 0.4662,
+ "step": 382
+ },
+ {
+ "epoch": 1.804075235109718,
+ "grad_norm": 0.4890702962875366,
+ "learning_rate": 4.314511917160557e-06,
+ "loss": 0.4683,
+ "step": 383
+ },
+ {
+ "epoch": 1.8087774294670846,
+ "grad_norm": 0.4724109470844269,
+ "learning_rate": 4.3098955404328045e-06,
+ "loss": 0.4602,
+ "step": 384
+ },
+ {
+ "epoch": 1.8134796238244513,
+ "grad_norm": 0.4933278560638428,
+ "learning_rate": 4.305266159081895e-06,
+ "loss": 0.4806,
+ "step": 385
+ },
+ {
+ "epoch": 1.8181818181818183,
+ "grad_norm": 0.5068219304084778,
+ "learning_rate": 4.3006238063712725e-06,
+ "loss": 0.4647,
+ "step": 386
+ },
+ {
+ "epoch": 1.822884012539185,
+ "grad_norm": 0.5293509364128113,
+ "learning_rate": 4.295968515657583e-06,
+ "loss": 0.4998,
+ "step": 387
+ },
+ {
+ "epoch": 1.8275862068965516,
+ "grad_norm": 0.4775199294090271,
+ "learning_rate": 4.29130032039044e-06,
+ "loss": 0.4821,
+ "step": 388
+ },
+ {
+ "epoch": 1.8322884012539185,
+ "grad_norm": 0.4914006292819977,
+ "learning_rate": 4.2866192541121755e-06,
+ "loss": 0.4735,
+ "step": 389
+ },
+ {
+ "epoch": 1.8369905956112853,
+ "grad_norm": 0.5009908080101013,
+ "learning_rate": 4.281925350457606e-06,
+ "loss": 0.4741,
+ "step": 390
+ },
+ {
+ "epoch": 1.841692789968652,
+ "grad_norm": 0.47211164236068726,
+ "learning_rate": 4.277218643153787e-06,
+ "loss": 0.4786,
+ "step": 391
+ },
+ {
+ "epoch": 1.8463949843260188,
+ "grad_norm": 1.9644113779067993,
+ "learning_rate": 4.272499166019771e-06,
+ "loss": 0.4759,
+ "step": 392
+ },
+ {
+ "epoch": 1.8510971786833856,
+ "grad_norm": 0.535971999168396,
+ "learning_rate": 4.267766952966369e-06,
+ "loss": 0.4665,
+ "step": 393
+ },
+ {
+ "epoch": 1.8557993730407523,
+ "grad_norm": 0.4666787385940552,
+ "learning_rate": 4.2630220379959006e-06,
+ "loss": 0.4417,
+ "step": 394
+ },
+ {
+ "epoch": 1.8605015673981191,
+ "grad_norm": 0.5976264476776123,
+ "learning_rate": 4.258264455201953e-06,
+ "loss": 0.4665,
+ "step": 395
+ },
+ {
+ "epoch": 1.865203761755486,
+ "grad_norm": 0.4814331531524658,
+ "learning_rate": 4.2534942387691335e-06,
+ "loss": 0.4896,
+ "step": 396
+ },
+ {
+ "epoch": 1.8699059561128526,
+ "grad_norm": 0.4929859936237335,
+ "learning_rate": 4.248711422972829e-06,
+ "loss": 0.4765,
+ "step": 397
+ },
+ {
+ "epoch": 1.8746081504702194,
+ "grad_norm": 0.517914354801178,
+ "learning_rate": 4.243916042178954e-06,
+ "loss": 0.4601,
+ "step": 398
+ },
+ {
+ "epoch": 1.8793103448275863,
+ "grad_norm": 0.47731271386146545,
+ "learning_rate": 4.239108130843709e-06,
+ "loss": 0.469,
+ "step": 399
+ },
+ {
+ "epoch": 1.884012539184953,
+ "grad_norm": 0.4939954876899719,
+ "learning_rate": 4.234287723513326e-06,
+ "loss": 0.4929,
+ "step": 400
+ },
+ {
+ "epoch": 1.8887147335423198,
+ "grad_norm": 0.48573923110961914,
+ "learning_rate": 4.229454854823827e-06,
+ "loss": 0.4913,
+ "step": 401
+ },
+ {
+ "epoch": 1.8934169278996866,
+ "grad_norm": 0.5146409273147583,
+ "learning_rate": 4.224609559500772e-06,
+ "loss": 0.502,
+ "step": 402
+ },
+ {
+ "epoch": 1.8981191222570533,
+ "grad_norm": 0.4884675443172455,
+ "learning_rate": 4.21975187235901e-06,
+ "loss": 0.4541,
+ "step": 403
+ },
+ {
+ "epoch": 1.90282131661442,
+ "grad_norm": 0.4871810972690582,
+ "learning_rate": 4.21488182830243e-06,
+ "loss": 0.4811,
+ "step": 404
+ },
+ {
+ "epoch": 1.907523510971787,
+ "grad_norm": 0.5089552402496338,
+ "learning_rate": 4.209999462323706e-06,
+ "loss": 0.4584,
+ "step": 405
+ },
+ {
+ "epoch": 1.9122257053291536,
+ "grad_norm": 0.6191231608390808,
+ "learning_rate": 4.20510480950405e-06,
+ "loss": 0.4885,
+ "step": 406
+ },
+ {
+ "epoch": 1.9169278996865202,
+ "grad_norm": 0.5512096285820007,
+ "learning_rate": 4.200197905012961e-06,
+ "loss": 0.4529,
+ "step": 407
+ },
+ {
+ "epoch": 1.9216300940438873,
+ "grad_norm": 0.4743112027645111,
+ "learning_rate": 4.195278784107965e-06,
+ "loss": 0.4702,
+ "step": 408
+ },
+ {
+ "epoch": 1.926332288401254,
+ "grad_norm": 0.4635118544101715,
+ "learning_rate": 4.19034748213437e-06,
+ "loss": 0.4718,
+ "step": 409
+ },
+ {
+ "epoch": 1.9310344827586206,
+ "grad_norm": 0.48715919256210327,
+ "learning_rate": 4.185404034525008e-06,
+ "loss": 0.4638,
+ "step": 410
+ },
+ {
+ "epoch": 1.9357366771159876,
+ "grad_norm": 0.5373724102973938,
+ "learning_rate": 4.180448476799981e-06,
+ "loss": 0.5009,
+ "step": 411
+ },
+ {
+ "epoch": 1.9404388714733543,
+ "grad_norm": 0.4978715479373932,
+ "learning_rate": 4.175480844566404e-06,
+ "loss": 0.4726,
+ "step": 412
+ },
+ {
+ "epoch": 1.9451410658307209,
+ "grad_norm": 0.44817060232162476,
+ "learning_rate": 4.170501173518152e-06,
+ "loss": 0.4683,
+ "step": 413
+ },
+ {
+ "epoch": 1.9498432601880877,
+ "grad_norm": 0.48472973704338074,
+ "learning_rate": 4.165509499435604e-06,
+ "loss": 0.4662,
+ "step": 414
+ },
+ {
+ "epoch": 1.9545454545454546,
+ "grad_norm": 0.6567174792289734,
+ "learning_rate": 4.16050585818538e-06,
+ "loss": 0.4801,
+ "step": 415
+ },
+ {
+ "epoch": 1.9592476489028212,
+ "grad_norm": 0.5131425857543945,
+ "learning_rate": 4.155490285720092e-06,
+ "loss": 0.5036,
+ "step": 416
+ },
+ {
+ "epoch": 1.963949843260188,
+ "grad_norm": 0.46051982045173645,
+ "learning_rate": 4.150462818078079e-06,
+ "loss": 0.4911,
+ "step": 417
+ },
+ {
+ "epoch": 1.968652037617555,
+ "grad_norm": 0.5288883447647095,
+ "learning_rate": 4.145423491383153e-06,
+ "loss": 0.4871,
+ "step": 418
+ },
+ {
+ "epoch": 1.9733542319749215,
+ "grad_norm": 0.5143817663192749,
+ "learning_rate": 4.14037234184433e-06,
+ "loss": 0.5027,
+ "step": 419
+ },
+ {
+ "epoch": 1.9780564263322884,
+ "grad_norm": 0.46323707699775696,
+ "learning_rate": 4.135309405755583e-06,
+ "loss": 0.4876,
+ "step": 420
+ },
+ {
+ "epoch": 1.9827586206896552,
+ "grad_norm": 0.5239706039428711,
+ "learning_rate": 4.130234719495574e-06,
+ "loss": 0.4702,
+ "step": 421
+ },
+ {
+ "epoch": 1.9874608150470219,
+ "grad_norm": 0.538753867149353,
+ "learning_rate": 4.125148319527391e-06,
+ "loss": 0.4638,
+ "step": 422
+ },
+ {
+ "epoch": 1.9921630094043887,
+ "grad_norm": 0.5180181860923767,
+ "learning_rate": 4.1200502423982904e-06,
+ "loss": 0.4841,
+ "step": 423
+ },
+ {
+ "epoch": 1.9968652037617556,
+ "grad_norm": 0.6698167324066162,
+ "learning_rate": 4.1149405247394295e-06,
+ "loss": 0.4882,
+ "step": 424
+ },
+ {
+ "epoch": 2.0047021943573666,
+ "grad_norm": 0.9728522896766663,
+ "learning_rate": 4.10981920326561e-06,
+ "loss": 0.9125,
+ "step": 425
+ },
+ {
+ "epoch": 2.0094043887147337,
+ "grad_norm": 0.7356107831001282,
+ "learning_rate": 4.104686314775009e-06,
+ "loss": 0.4422,
+ "step": 426
+ },
+ {
+ "epoch": 2.0141065830721003,
+ "grad_norm": 0.44414228200912476,
+ "learning_rate": 4.099541896148914e-06,
+ "loss": 0.4511,
+ "step": 427
+ },
+ {
+ "epoch": 2.018808777429467,
+ "grad_norm": 0.5738011002540588,
+ "learning_rate": 4.094385984351462e-06,
+ "loss": 0.4457,
+ "step": 428
+ },
+ {
+ "epoch": 2.023510971786834,
+ "grad_norm": 0.4643106460571289,
+ "learning_rate": 4.0892186164293715e-06,
+ "loss": 0.4644,
+ "step": 429
+ },
+ {
+ "epoch": 2.0282131661442007,
+ "grad_norm": 0.5355309247970581,
+ "learning_rate": 4.0840398295116745e-06,
+ "loss": 0.4535,
+ "step": 430
+ },
+ {
+ "epoch": 2.0329153605015673,
+ "grad_norm": 0.512458324432373,
+ "learning_rate": 4.078849660809456e-06,
+ "loss": 0.4481,
+ "step": 431
+ },
+ {
+ "epoch": 2.0376175548589344,
+ "grad_norm": 0.5055253505706787,
+ "learning_rate": 4.073648147615579e-06,
+ "loss": 0.4309,
+ "step": 432
+ },
+ {
+ "epoch": 2.042319749216301,
+ "grad_norm": 0.5128353834152222,
+ "learning_rate": 4.068435327304421e-06,
+ "loss": 0.4562,
+ "step": 433
+ },
+ {
+ "epoch": 2.0470219435736676,
+ "grad_norm": 0.4432103633880615,
+ "learning_rate": 4.063211237331603e-06,
+ "loss": 0.4535,
+ "step": 434
+ },
+ {
+ "epoch": 2.0517241379310347,
+ "grad_norm": 0.5092498660087585,
+ "learning_rate": 4.057975915233725e-06,
+ "loss": 0.4385,
+ "step": 435
+ },
+ {
+ "epoch": 2.0564263322884013,
+ "grad_norm": 0.4798133671283722,
+ "learning_rate": 4.052729398628089e-06,
+ "loss": 0.466,
+ "step": 436
+ },
+ {
+ "epoch": 2.061128526645768,
+ "grad_norm": 0.5094019770622253,
+ "learning_rate": 4.047471725212437e-06,
+ "loss": 0.4624,
+ "step": 437
+ },
+ {
+ "epoch": 2.0658307210031346,
+ "grad_norm": 0.5814178586006165,
+ "learning_rate": 4.042202932764673e-06,
+ "loss": 0.4472,
+ "step": 438
+ },
+ {
+ "epoch": 2.0705329153605017,
+ "grad_norm": 0.503394365310669,
+ "learning_rate": 4.036923059142595e-06,
+ "loss": 0.4481,
+ "step": 439
+ },
+ {
+ "epoch": 2.0752351097178683,
+ "grad_norm": 0.5108861923217773,
+ "learning_rate": 4.031632142283623e-06,
+ "loss": 0.4416,
+ "step": 440
+ },
+ {
+ "epoch": 2.079937304075235,
+ "grad_norm": 0.5303971171379089,
+ "learning_rate": 4.026330220204524e-06,
+ "loss": 0.4515,
+ "step": 441
+ },
+ {
+ "epoch": 2.084639498432602,
+ "grad_norm": 0.45014286041259766,
+ "learning_rate": 4.021017331001146e-06,
+ "loss": 0.441,
+ "step": 442
+ },
+ {
+ "epoch": 2.0893416927899686,
+ "grad_norm": 0.5371219515800476,
+ "learning_rate": 4.015693512848131e-06,
+ "loss": 0.4471,
+ "step": 443
+ },
+ {
+ "epoch": 2.0940438871473352,
+ "grad_norm": 0.5105510354042053,
+ "learning_rate": 4.0103588039986556e-06,
+ "loss": 0.4534,
+ "step": 444
+ },
+ {
+ "epoch": 2.0987460815047023,
+ "grad_norm": 0.4960611164569855,
+ "learning_rate": 4.005013242784146e-06,
+ "loss": 0.46,
+ "step": 445
+ },
+ {
+ "epoch": 2.103448275862069,
+ "grad_norm": 0.500354528427124,
+ "learning_rate": 3.999656867614006e-06,
+ "loss": 0.45,
+ "step": 446
+ },
+ {
+ "epoch": 2.1081504702194356,
+ "grad_norm": 0.4733876585960388,
+ "learning_rate": 3.994289716975341e-06,
+ "loss": 0.4644,
+ "step": 447
+ },
+ {
+ "epoch": 2.1128526645768027,
+ "grad_norm": 0.5002915263175964,
+ "learning_rate": 3.988911829432682e-06,
+ "loss": 0.4493,
+ "step": 448
+ },
+ {
+ "epoch": 2.1175548589341693,
+ "grad_norm": 0.48520293831825256,
+ "learning_rate": 3.983523243627706e-06,
+ "loss": 0.4458,
+ "step": 449
+ },
+ {
+ "epoch": 2.122257053291536,
+ "grad_norm": 0.6339934468269348,
+ "learning_rate": 3.978123998278962e-06,
+ "loss": 0.4352,
+ "step": 450
+ },
+ {
+ "epoch": 2.126959247648903,
+ "grad_norm": 1.172338843345642,
+ "learning_rate": 3.97271413218159e-06,
+ "loss": 0.4664,
+ "step": 451
+ },
+ {
+ "epoch": 2.1316614420062696,
+ "grad_norm": 0.47842296957969666,
+ "learning_rate": 3.9672936842070425e-06,
+ "loss": 0.4604,
+ "step": 452
+ },
+ {
+ "epoch": 2.1363636363636362,
+ "grad_norm": 0.506851077079773,
+ "learning_rate": 3.9618626933028086e-06,
+ "loss": 0.4674,
+ "step": 453
+ },
+ {
+ "epoch": 2.1410658307210033,
+ "grad_norm": 0.4922677278518677,
+ "learning_rate": 3.956421198492128e-06,
+ "loss": 0.4476,
+ "step": 454
+ },
+ {
+ "epoch": 2.14576802507837,
+ "grad_norm": 0.5307339429855347,
+ "learning_rate": 3.950969238873714e-06,
+ "loss": 0.4463,
+ "step": 455
+ },
+ {
+ "epoch": 2.1504702194357366,
+ "grad_norm": 0.5131121873855591,
+ "learning_rate": 3.9455068536214765e-06,
+ "loss": 0.4779,
+ "step": 456
+ },
+ {
+ "epoch": 2.1551724137931036,
+ "grad_norm": 0.5438089966773987,
+ "learning_rate": 3.9400340819842335e-06,
+ "loss": 0.4563,
+ "step": 457
+ },
+ {
+ "epoch": 2.1598746081504703,
+ "grad_norm": 0.7426711916923523,
+ "learning_rate": 3.934550963285432e-06,
+ "loss": 0.4561,
+ "step": 458
+ },
+ {
+ "epoch": 2.164576802507837,
+ "grad_norm": 0.482920378446579,
+ "learning_rate": 3.9290575369228664e-06,
+ "loss": 0.4293,
+ "step": 459
+ },
+ {
+ "epoch": 2.169278996865204,
+ "grad_norm": 0.6583715081214905,
+ "learning_rate": 3.923553842368396e-06,
+ "loss": 0.4682,
+ "step": 460
+ },
+ {
+ "epoch": 2.1739811912225706,
+ "grad_norm": 0.47901806235313416,
+ "learning_rate": 3.918039919167658e-06,
+ "loss": 0.4342,
+ "step": 461
+ },
+ {
+ "epoch": 2.1786833855799372,
+ "grad_norm": 0.4929746389389038,
+ "learning_rate": 3.912515806939786e-06,
+ "loss": 0.4478,
+ "step": 462
+ },
+ {
+ "epoch": 2.183385579937304,
+ "grad_norm": 0.48205333948135376,
+ "learning_rate": 3.906981545377124e-06,
+ "loss": 0.4595,
+ "step": 463
+ },
+ {
+ "epoch": 2.188087774294671,
+ "grad_norm": 0.5059337019920349,
+ "learning_rate": 3.901437174244943e-06,
+ "loss": 0.4294,
+ "step": 464
+ },
+ {
+ "epoch": 2.1927899686520376,
+ "grad_norm": 0.4752981662750244,
+ "learning_rate": 3.895882733381154e-06,
+ "loss": 0.448,
+ "step": 465
+ },
+ {
+ "epoch": 2.197492163009404,
+ "grad_norm": 0.5249196290969849,
+ "learning_rate": 3.890318262696023e-06,
+ "loss": 0.4655,
+ "step": 466
+ },
+ {
+ "epoch": 2.2021943573667713,
+ "grad_norm": 0.48044726252555847,
+ "learning_rate": 3.8847438021718805e-06,
+ "loss": 0.4413,
+ "step": 467
+ },
+ {
+ "epoch": 2.206896551724138,
+ "grad_norm": 0.84516841173172,
+ "learning_rate": 3.879159391862839e-06,
+ "loss": 0.4645,
+ "step": 468
+ },
+ {
+ "epoch": 2.2115987460815045,
+ "grad_norm": 0.5334392786026001,
+ "learning_rate": 3.873565071894503e-06,
+ "loss": 0.4347,
+ "step": 469
+ },
+ {
+ "epoch": 2.2163009404388716,
+ "grad_norm": 0.5113687515258789,
+ "learning_rate": 3.86796088246368e-06,
+ "loss": 0.4314,
+ "step": 470
+ },
+ {
+ "epoch": 2.2210031347962382,
+ "grad_norm": 0.5226101279258728,
+ "learning_rate": 3.8623468638380905e-06,
+ "loss": 0.418,
+ "step": 471
+ },
+ {
+ "epoch": 2.225705329153605,
+ "grad_norm": 0.4901522099971771,
+ "learning_rate": 3.856723056356085e-06,
+ "loss": 0.4597,
+ "step": 472
+ },
+ {
+ "epoch": 2.230407523510972,
+ "grad_norm": 0.5312012434005737,
+ "learning_rate": 3.851089500426346e-06,
+ "loss": 0.4444,
+ "step": 473
+ },
+ {
+ "epoch": 2.2351097178683386,
+ "grad_norm": 0.5347906351089478,
+ "learning_rate": 3.845446236527605e-06,
+ "loss": 0.4447,
+ "step": 474
+ },
+ {
+ "epoch": 2.239811912225705,
+ "grad_norm": 0.4781494438648224,
+ "learning_rate": 3.8397933052083445e-06,
+ "loss": 0.462,
+ "step": 475
+ },
+ {
+ "epoch": 2.2445141065830723,
+ "grad_norm": 0.5215012431144714,
+ "learning_rate": 3.834130747086512e-06,
+ "loss": 0.4475,
+ "step": 476
+ },
+ {
+ "epoch": 2.249216300940439,
+ "grad_norm": 0.5048666000366211,
+ "learning_rate": 3.828458602849226e-06,
+ "loss": 0.4483,
+ "step": 477
+ },
+ {
+ "epoch": 2.2539184952978055,
+ "grad_norm": 0.5508173108100891,
+ "learning_rate": 3.822776913252485e-06,
+ "loss": 0.4511,
+ "step": 478
+ },
+ {
+ "epoch": 2.2586206896551726,
+ "grad_norm": 0.5031043887138367,
+ "learning_rate": 3.817085719120872e-06,
+ "loss": 0.4019,
+ "step": 479
+ },
+ {
+ "epoch": 2.2633228840125392,
+ "grad_norm": 0.508939802646637,
+ "learning_rate": 3.811385061347263e-06,
+ "loss": 0.4461,
+ "step": 480
+ },
+ {
+ "epoch": 2.268025078369906,
+ "grad_norm": 0.5605170726776123,
+ "learning_rate": 3.805674980892535e-06,
+ "loss": 0.4695,
+ "step": 481
+ },
+ {
+ "epoch": 2.2727272727272725,
+ "grad_norm": 0.5526806712150574,
+ "learning_rate": 3.7999555187852667e-06,
+ "loss": 0.4575,
+ "step": 482
+ },
+ {
+ "epoch": 2.2774294670846396,
+ "grad_norm": 0.47659724950790405,
+ "learning_rate": 3.7942267161214497e-06,
+ "loss": 0.4433,
+ "step": 483
+ },
+ {
+ "epoch": 2.282131661442006,
+ "grad_norm": 0.49713975191116333,
+ "learning_rate": 3.7884886140641884e-06,
+ "loss": 0.4692,
+ "step": 484
+ },
+ {
+ "epoch": 2.2868338557993733,
+ "grad_norm": 0.48685988783836365,
+ "learning_rate": 3.7827412538434062e-06,
+ "loss": 0.4328,
+ "step": 485
+ },
+ {
+ "epoch": 2.29153605015674,
+ "grad_norm": 0.5074832439422607,
+ "learning_rate": 3.7769846767555495e-06,
+ "loss": 0.4598,
+ "step": 486
+ },
+ {
+ "epoch": 2.2962382445141065,
+ "grad_norm": 0.5333994030952454,
+ "learning_rate": 3.7712189241632898e-06,
+ "loss": 0.4554,
+ "step": 487
+ },
+ {
+ "epoch": 2.300940438871473,
+ "grad_norm": 0.49985551834106445,
+ "learning_rate": 3.7654440374952288e-06,
+ "loss": 0.4421,
+ "step": 488
+ },
+ {
+ "epoch": 2.30564263322884,
+ "grad_norm": 0.4791257679462433,
+ "learning_rate": 3.7596600582455976e-06,
+ "loss": 0.4187,
+ "step": 489
+ },
+ {
+ "epoch": 2.310344827586207,
+ "grad_norm": 0.4951220154762268,
+ "learning_rate": 3.75386702797396e-06,
+ "loss": 0.4205,
+ "step": 490
+ },
+ {
+ "epoch": 2.3150470219435735,
+ "grad_norm": 0.4765990674495697,
+ "learning_rate": 3.7480649883049164e-06,
+ "loss": 0.4251,
+ "step": 491
+ },
+ {
+ "epoch": 2.3197492163009406,
+ "grad_norm": 0.5125405192375183,
+ "learning_rate": 3.7422539809277993e-06,
+ "loss": 0.4361,
+ "step": 492
+ },
+ {
+ "epoch": 2.324451410658307,
+ "grad_norm": 0.5286112427711487,
+ "learning_rate": 3.736434047596379e-06,
+ "loss": 0.4423,
+ "step": 493
+ },
+ {
+ "epoch": 2.329153605015674,
+ "grad_norm": 0.47961002588272095,
+ "learning_rate": 3.73060523012856e-06,
+ "loss": 0.453,
+ "step": 494
+ },
+ {
+ "epoch": 2.333855799373041,
+ "grad_norm": 0.5857998728752136,
+ "learning_rate": 3.724767570406082e-06,
+ "loss": 0.4674,
+ "step": 495
+ },
+ {
+ "epoch": 2.3385579937304075,
+ "grad_norm": 0.5348326563835144,
+ "learning_rate": 3.7189211103742206e-06,
+ "loss": 0.4267,
+ "step": 496
+ },
+ {
+ "epoch": 2.343260188087774,
+ "grad_norm": 0.4718475937843323,
+ "learning_rate": 3.7130658920414818e-06,
+ "loss": 0.4619,
+ "step": 497
+ },
+ {
+ "epoch": 2.347962382445141,
+ "grad_norm": 0.44225215911865234,
+ "learning_rate": 3.7072019574793034e-06,
+ "loss": 0.4712,
+ "step": 498
+ },
+ {
+ "epoch": 2.352664576802508,
+ "grad_norm": 0.48492008447647095,
+ "learning_rate": 3.701329348821752e-06,
+ "loss": 0.4521,
+ "step": 499
+ },
+ {
+ "epoch": 2.3573667711598745,
+ "grad_norm": 0.49741214513778687,
+ "learning_rate": 3.695448108265221e-06,
+ "loss": 0.4378,
+ "step": 500
+ },
+ {
+ "epoch": 2.3620689655172415,
+ "grad_norm": 0.5086454749107361,
+ "learning_rate": 3.6895582780681254e-06,
+ "loss": 0.4349,
+ "step": 501
+ },
+ {
+ "epoch": 2.366771159874608,
+ "grad_norm": 0.49111631512641907,
+ "learning_rate": 3.683659900550598e-06,
+ "loss": 0.4625,
+ "step": 502
+ },
+ {
+ "epoch": 2.371473354231975,
+ "grad_norm": 0.5006322264671326,
+ "learning_rate": 3.6777530180941894e-06,
+ "loss": 0.4457,
+ "step": 503
+ },
+ {
+ "epoch": 2.376175548589342,
+ "grad_norm": 0.5934097170829773,
+ "learning_rate": 3.671837673141559e-06,
+ "loss": 0.4306,
+ "step": 504
+ },
+ {
+ "epoch": 2.3808777429467085,
+ "grad_norm": 0.626039981842041,
+ "learning_rate": 3.6659139081961707e-06,
+ "loss": 0.4464,
+ "step": 505
+ },
+ {
+ "epoch": 2.385579937304075,
+ "grad_norm": 0.4751131236553192,
+ "learning_rate": 3.6599817658219916e-06,
+ "loss": 0.4508,
+ "step": 506
+ },
+ {
+ "epoch": 2.3902821316614418,
+ "grad_norm": 1.4542276859283447,
+ "learning_rate": 3.6540412886431796e-06,
+ "loss": 0.4606,
+ "step": 507
+ },
+ {
+ "epoch": 2.394984326018809,
+ "grad_norm": 0.5189768075942993,
+ "learning_rate": 3.648092519343783e-06,
+ "loss": 0.4435,
+ "step": 508
+ },
+ {
+ "epoch": 2.3996865203761755,
+ "grad_norm": 1.4583938121795654,
+ "learning_rate": 3.642135500667431e-06,
+ "loss": 0.4314,
+ "step": 509
+ },
+ {
+ "epoch": 2.4043887147335425,
+ "grad_norm": 0.5038107633590698,
+ "learning_rate": 3.6361702754170247e-06,
+ "loss": 0.4463,
+ "step": 510
+ },
+ {
+ "epoch": 2.409090909090909,
+ "grad_norm": 0.5786447525024414,
+ "learning_rate": 3.630196886454435e-06,
+ "loss": 0.4281,
+ "step": 511
+ },
+ {
+ "epoch": 2.413793103448276,
+ "grad_norm": 0.48684218525886536,
+ "learning_rate": 3.62421537670019e-06,
+ "loss": 0.4432,
+ "step": 512
+ },
+ {
+ "epoch": 2.4184952978056424,
+ "grad_norm": 0.5117013454437256,
+ "learning_rate": 3.618225789133167e-06,
+ "loss": 0.4464,
+ "step": 513
+ },
+ {
+ "epoch": 2.4231974921630095,
+ "grad_norm": 0.49249181151390076,
+ "learning_rate": 3.612228166790287e-06,
+ "loss": 0.4465,
+ "step": 514
+ },
+ {
+ "epoch": 2.427899686520376,
+ "grad_norm": 0.5761134624481201,
+ "learning_rate": 3.606222552766201e-06,
+ "loss": 0.4539,
+ "step": 515
+ },
+ {
+ "epoch": 2.4326018808777428,
+ "grad_norm": 0.4839339256286621,
+ "learning_rate": 3.6002089902129844e-06,
+ "loss": 0.4469,
+ "step": 516
+ },
+ {
+ "epoch": 2.43730407523511,
+ "grad_norm": 0.4765976369380951,
+ "learning_rate": 3.5941875223398225e-06,
+ "loss": 0.4379,
+ "step": 517
+ },
+ {
+ "epoch": 2.4420062695924765,
+ "grad_norm": 0.5239338874816895,
+ "learning_rate": 3.588158192412707e-06,
+ "loss": 0.4354,
+ "step": 518
+ },
+ {
+ "epoch": 2.446708463949843,
+ "grad_norm": 0.48244595527648926,
+ "learning_rate": 3.582121043754116e-06,
+ "loss": 0.438,
+ "step": 519
+ },
+ {
+ "epoch": 2.45141065830721,
+ "grad_norm": 0.4641244411468506,
+ "learning_rate": 3.5760761197427097e-06,
+ "loss": 0.438,
+ "step": 520
+ },
+ {
+ "epoch": 2.456112852664577,
+ "grad_norm": 0.48468074202537537,
+ "learning_rate": 3.570023463813017e-06,
+ "loss": 0.4306,
+ "step": 521
+ },
+ {
+ "epoch": 2.4608150470219434,
+ "grad_norm": 0.48626402020454407,
+ "learning_rate": 3.5639631194551216e-06,
+ "loss": 0.4531,
+ "step": 522
+ },
+ {
+ "epoch": 2.4655172413793105,
+ "grad_norm": 0.5581764578819275,
+ "learning_rate": 3.557895130214352e-06,
+ "loss": 0.4451,
+ "step": 523
+ },
+ {
+ "epoch": 2.470219435736677,
+ "grad_norm": 0.6739279627799988,
+ "learning_rate": 3.5518195396909653e-06,
+ "loss": 0.4636,
+ "step": 524
+ },
+ {
+ "epoch": 2.4749216300940438,
+ "grad_norm": 0.550710916519165,
+ "learning_rate": 3.5457363915398384e-06,
+ "loss": 0.4513,
+ "step": 525
+ },
+ {
+ "epoch": 2.479623824451411,
+ "grad_norm": 0.479632705450058,
+ "learning_rate": 3.539645729470151e-06,
+ "loss": 0.4387,
+ "step": 526
+ },
+ {
+ "epoch": 2.4843260188087775,
+ "grad_norm": 0.48741331696510315,
+ "learning_rate": 3.5335475972450715e-06,
+ "loss": 0.4388,
+ "step": 527
+ },
+ {
+ "epoch": 2.489028213166144,
+ "grad_norm": 0.4964964985847473,
+ "learning_rate": 3.5274420386814458e-06,
+ "loss": 0.4643,
+ "step": 528
+ },
+ {
+ "epoch": 2.493730407523511,
+ "grad_norm": 0.5134934186935425,
+ "learning_rate": 3.521329097649478e-06,
+ "loss": 0.4454,
+ "step": 529
+ },
+ {
+ "epoch": 2.498432601880878,
+ "grad_norm": 0.4962058961391449,
+ "learning_rate": 3.515208818072418e-06,
+ "loss": 0.4408,
+ "step": 530
+ },
+ {
+ "epoch": 2.5031347962382444,
+ "grad_norm": 0.5611489415168762,
+ "learning_rate": 3.509081243926247e-06,
+ "loss": 0.4306,
+ "step": 531
+ },
+ {
+ "epoch": 2.507836990595611,
+ "grad_norm": 0.7012472748756409,
+ "learning_rate": 3.5029464192393557e-06,
+ "loss": 0.4614,
+ "step": 532
+ },
+ {
+ "epoch": 2.512539184952978,
+ "grad_norm": 0.5351004004478455,
+ "learning_rate": 3.4968043880922363e-06,
+ "loss": 0.4151,
+ "step": 533
+ },
+ {
+ "epoch": 2.5172413793103448,
+ "grad_norm": 0.5087808966636658,
+ "learning_rate": 3.4906551946171603e-06,
+ "loss": 0.4242,
+ "step": 534
+ },
+ {
+ "epoch": 2.521943573667712,
+ "grad_norm": 0.5459093451499939,
+ "learning_rate": 3.484498882997861e-06,
+ "loss": 0.4215,
+ "step": 535
+ },
+ {
+ "epoch": 2.5266457680250785,
+ "grad_norm": 0.49804285168647766,
+ "learning_rate": 3.478335497469219e-06,
+ "loss": 0.4492,
+ "step": 536
+ },
+ {
+ "epoch": 2.531347962382445,
+ "grad_norm": 0.4959704875946045,
+ "learning_rate": 3.472165082316943e-06,
+ "loss": 0.4511,
+ "step": 537
+ },
+ {
+ "epoch": 2.5360501567398117,
+ "grad_norm": 0.5059382319450378,
+ "learning_rate": 3.465987681877251e-06,
+ "loss": 0.4419,
+ "step": 538
+ },
+ {
+ "epoch": 2.540752351097179,
+ "grad_norm": 0.7398380637168884,
+ "learning_rate": 3.4598033405365527e-06,
+ "loss": 0.4548,
+ "step": 539
+ },
+ {
+ "epoch": 2.5454545454545454,
+ "grad_norm": 0.5326687693595886,
+ "learning_rate": 3.45361210273113e-06,
+ "loss": 0.4473,
+ "step": 540
+ },
+ {
+ "epoch": 2.5501567398119125,
+ "grad_norm": 0.5069761872291565,
+ "learning_rate": 3.447414012946818e-06,
+ "loss": 0.4343,
+ "step": 541
+ },
+ {
+ "epoch": 2.554858934169279,
+ "grad_norm": 0.45915964245796204,
+ "learning_rate": 3.4412091157186853e-06,
+ "loss": 0.4499,
+ "step": 542
+ },
+ {
+ "epoch": 2.5595611285266457,
+ "grad_norm": 0.5174360275268555,
+ "learning_rate": 3.4349974556307146e-06,
+ "loss": 0.44,
+ "step": 543
+ },
+ {
+ "epoch": 2.5642633228840124,
+ "grad_norm": 0.5008105039596558,
+ "learning_rate": 3.4287790773154807e-06,
+ "loss": 0.4648,
+ "step": 544
+ },
+ {
+ "epoch": 2.5689655172413794,
+ "grad_norm": 0.5628801584243774,
+ "learning_rate": 3.4225540254538297e-06,
+ "loss": 0.462,
+ "step": 545
+ },
+ {
+ "epoch": 2.573667711598746,
+ "grad_norm": 0.9913654923439026,
+ "learning_rate": 3.416322344774562e-06,
+ "loss": 0.4403,
+ "step": 546
+ },
+ {
+ "epoch": 2.5783699059561127,
+ "grad_norm": 0.5034172534942627,
+ "learning_rate": 3.4100840800541055e-06,
+ "loss": 0.4622,
+ "step": 547
+ },
+ {
+ "epoch": 2.58307210031348,
+ "grad_norm": 0.495516836643219,
+ "learning_rate": 3.4038392761161986e-06,
+ "loss": 0.4523,
+ "step": 548
+ },
+ {
+ "epoch": 2.5877742946708464,
+ "grad_norm": 0.48142367601394653,
+ "learning_rate": 3.3975879778315634e-06,
+ "loss": 0.4242,
+ "step": 549
+ },
+ {
+ "epoch": 2.592476489028213,
+ "grad_norm": 0.4635900557041168,
+ "learning_rate": 3.391330230117587e-06,
+ "loss": 0.3949,
+ "step": 550
+ },
+ {
+ "epoch": 2.5971786833855797,
+ "grad_norm": 0.4769044816493988,
+ "learning_rate": 3.385066077937997e-06,
+ "loss": 0.4651,
+ "step": 551
+ },
+ {
+ "epoch": 2.6018808777429467,
+ "grad_norm": 1.059553861618042,
+ "learning_rate": 3.378795566302541e-06,
+ "loss": 0.4243,
+ "step": 552
+ },
+ {
+ "epoch": 2.6065830721003134,
+ "grad_norm": 0.512134850025177,
+ "learning_rate": 3.372518740266658e-06,
+ "loss": 0.4435,
+ "step": 553
+ },
+ {
+ "epoch": 2.6112852664576804,
+ "grad_norm": 0.5267173647880554,
+ "learning_rate": 3.36623564493116e-06,
+ "loss": 0.4558,
+ "step": 554
+ },
+ {
+ "epoch": 2.615987460815047,
+ "grad_norm": 0.49343907833099365,
+ "learning_rate": 3.3599463254419047e-06,
+ "loss": 0.4598,
+ "step": 555
+ },
+ {
+ "epoch": 2.6206896551724137,
+ "grad_norm": 0.5496839284896851,
+ "learning_rate": 3.3536508269894724e-06,
+ "loss": 0.4669,
+ "step": 556
+ },
+ {
+ "epoch": 2.6253918495297803,
+ "grad_norm": 0.5957831740379333,
+ "learning_rate": 3.347349194808842e-06,
+ "loss": 0.4533,
+ "step": 557
+ },
+ {
+ "epoch": 2.6300940438871474,
+ "grad_norm": 0.5049230456352234,
+ "learning_rate": 3.3410414741790625e-06,
+ "loss": 0.4293,
+ "step": 558
+ },
+ {
+ "epoch": 2.634796238244514,
+ "grad_norm": 0.5167728066444397,
+ "learning_rate": 3.3347277104229332e-06,
+ "loss": 0.443,
+ "step": 559
+ },
+ {
+ "epoch": 2.639498432601881,
+ "grad_norm": 0.6090758442878723,
+ "learning_rate": 3.3284079489066728e-06,
+ "loss": 0.4378,
+ "step": 560
+ },
+ {
+ "epoch": 2.6442006269592477,
+ "grad_norm": 0.5165027379989624,
+ "learning_rate": 3.3220822350395966e-06,
+ "loss": 0.4302,
+ "step": 561
+ },
+ {
+ "epoch": 2.6489028213166144,
+ "grad_norm": 0.5152680277824402,
+ "learning_rate": 3.31575061427379e-06,
+ "loss": 0.4311,
+ "step": 562
+ },
+ {
+ "epoch": 2.653605015673981,
+ "grad_norm": 0.547235906124115,
+ "learning_rate": 3.3094131321037783e-06,
+ "loss": 0.4371,
+ "step": 563
+ },
+ {
+ "epoch": 2.658307210031348,
+ "grad_norm": 0.521981418132782,
+ "learning_rate": 3.303069834066206e-06,
+ "loss": 0.4346,
+ "step": 564
+ },
+ {
+ "epoch": 2.6630094043887147,
+ "grad_norm": 0.5127217769622803,
+ "learning_rate": 3.2967207657395055e-06,
+ "loss": 0.474,
+ "step": 565
+ },
+ {
+ "epoch": 2.6677115987460818,
+ "grad_norm": 0.5210872888565063,
+ "learning_rate": 3.2903659727435692e-06,
+ "loss": 0.4622,
+ "step": 566
+ },
+ {
+ "epoch": 2.6724137931034484,
+ "grad_norm": 0.5768873691558838,
+ "learning_rate": 3.284005500739423e-06,
+ "loss": 0.4556,
+ "step": 567
+ },
+ {
+ "epoch": 2.677115987460815,
+ "grad_norm": 0.5305764675140381,
+ "learning_rate": 3.2776393954289e-06,
+ "loss": 0.429,
+ "step": 568
+ },
+ {
+ "epoch": 2.6818181818181817,
+ "grad_norm": 0.5312129855155945,
+ "learning_rate": 3.271267702554307e-06,
+ "loss": 0.4208,
+ "step": 569
+ },
+ {
+ "epoch": 2.6865203761755487,
+ "grad_norm": 0.5433884859085083,
+ "learning_rate": 3.2648904678981032e-06,
+ "loss": 0.4647,
+ "step": 570
+ },
+ {
+ "epoch": 2.6912225705329154,
+ "grad_norm": 1.2331725358963013,
+ "learning_rate": 3.2585077372825636e-06,
+ "loss": 0.4126,
+ "step": 571
+ },
+ {
+ "epoch": 2.695924764890282,
+ "grad_norm": 0.5495198369026184,
+ "learning_rate": 3.2521195565694543e-06,
+ "loss": 0.4453,
+ "step": 572
+ },
+ {
+ "epoch": 2.700626959247649,
+ "grad_norm": 0.5230907201766968,
+ "learning_rate": 3.2457259716597023e-06,
+ "loss": 0.446,
+ "step": 573
+ },
+ {
+ "epoch": 2.7053291536050157,
+ "grad_norm": 0.4807503819465637,
+ "learning_rate": 3.2393270284930658e-06,
+ "loss": 0.4547,
+ "step": 574
+ },
+ {
+ "epoch": 2.7100313479623823,
+ "grad_norm": 0.5169614553451538,
+ "learning_rate": 3.2329227730478026e-06,
+ "loss": 0.4319,
+ "step": 575
+ },
+ {
+ "epoch": 2.714733542319749,
+ "grad_norm": 0.502966046333313,
+ "learning_rate": 3.2265132513403415e-06,
+ "loss": 0.4196,
+ "step": 576
+ },
+ {
+ "epoch": 2.719435736677116,
+ "grad_norm": 0.5387672781944275,
+ "learning_rate": 3.22009850942495e-06,
+ "loss": 0.4449,
+ "step": 577
+ },
+ {
+ "epoch": 2.7241379310344827,
+ "grad_norm": 0.5503709316253662,
+ "learning_rate": 3.213678593393405e-06,
+ "loss": 0.4589,
+ "step": 578
+ },
+ {
+ "epoch": 2.7288401253918497,
+ "grad_norm": 0.5165039300918579,
+ "learning_rate": 3.207253549374662e-06,
+ "loss": 0.4578,
+ "step": 579
+ },
+ {
+ "epoch": 2.7335423197492164,
+ "grad_norm": 0.5894023180007935,
+ "learning_rate": 3.200823423534519e-06,
+ "loss": 0.4448,
+ "step": 580
+ },
+ {
+ "epoch": 2.738244514106583,
+ "grad_norm": 0.5234156250953674,
+ "learning_rate": 3.194388262075293e-06,
+ "loss": 0.4504,
+ "step": 581
+ },
+ {
+ "epoch": 2.7429467084639496,
+ "grad_norm": 0.47498077154159546,
+ "learning_rate": 3.1879481112354804e-06,
+ "loss": 0.4471,
+ "step": 582
+ },
+ {
+ "epoch": 2.7476489028213167,
+ "grad_norm": 0.5213322043418884,
+ "learning_rate": 3.181503017289428e-06,
+ "loss": 0.4096,
+ "step": 583
+ },
+ {
+ "epoch": 2.7523510971786833,
+ "grad_norm": 0.5031464695930481,
+ "learning_rate": 3.175053026547002e-06,
+ "loss": 0.416,
+ "step": 584
+ },
+ {
+ "epoch": 2.7570532915360504,
+ "grad_norm": 0.7983574867248535,
+ "learning_rate": 3.16859818535325e-06,
+ "loss": 0.457,
+ "step": 585
+ },
+ {
+ "epoch": 2.761755485893417,
+ "grad_norm": 0.47774994373321533,
+ "learning_rate": 3.1621385400880756e-06,
+ "loss": 0.4529,
+ "step": 586
+ },
+ {
+ "epoch": 2.7664576802507836,
+ "grad_norm": 0.8216882348060608,
+ "learning_rate": 3.1556741371658984e-06,
+ "loss": 0.4559,
+ "step": 587
+ },
+ {
+ "epoch": 2.7711598746081503,
+ "grad_norm": 0.5124049186706543,
+ "learning_rate": 3.1492050230353238e-06,
+ "loss": 0.4438,
+ "step": 588
+ },
+ {
+ "epoch": 2.7758620689655173,
+ "grad_norm": 0.5410915017127991,
+ "learning_rate": 3.142731244178809e-06,
+ "loss": 0.4195,
+ "step": 589
+ },
+ {
+ "epoch": 2.780564263322884,
+ "grad_norm": 0.5318175554275513,
+ "learning_rate": 3.1362528471123277e-06,
+ "loss": 0.4046,
+ "step": 590
+ },
+ {
+ "epoch": 2.785266457680251,
+ "grad_norm": 0.6133676171302795,
+ "learning_rate": 3.129769878385039e-06,
+ "loss": 0.4098,
+ "step": 591
+ },
+ {
+ "epoch": 2.7899686520376177,
+ "grad_norm": 0.4698888063430786,
+ "learning_rate": 3.1232823845789473e-06,
+ "loss": 0.4508,
+ "step": 592
+ },
+ {
+ "epoch": 2.7946708463949843,
+ "grad_norm": 0.6980767250061035,
+ "learning_rate": 3.1167904123085736e-06,
+ "loss": 0.455,
+ "step": 593
+ },
+ {
+ "epoch": 2.799373040752351,
+ "grad_norm": 0.5151284337043762,
+ "learning_rate": 3.110294008220617e-06,
+ "loss": 0.4431,
+ "step": 594
+ },
+ {
+ "epoch": 2.804075235109718,
+ "grad_norm": 0.47901320457458496,
+ "learning_rate": 3.1037932189936205e-06,
+ "loss": 0.4406,
+ "step": 595
+ },
+ {
+ "epoch": 2.8087774294670846,
+ "grad_norm": 0.5079891085624695,
+ "learning_rate": 3.097288091337635e-06,
+ "loss": 0.4351,
+ "step": 596
+ },
+ {
+ "epoch": 2.8134796238244513,
+ "grad_norm": 0.5278874635696411,
+ "learning_rate": 3.0907786719938876e-06,
+ "loss": 0.4264,
+ "step": 597
+ },
+ {
+ "epoch": 2.8181818181818183,
+ "grad_norm": 0.47123396396636963,
+ "learning_rate": 3.084265007734436e-06,
+ "loss": 0.434,
+ "step": 598
+ },
+ {
+ "epoch": 2.822884012539185,
+ "grad_norm": 0.5229635834693909,
+ "learning_rate": 3.0777471453618457e-06,
+ "loss": 0.4602,
+ "step": 599
+ },
+ {
+ "epoch": 2.8275862068965516,
+ "grad_norm": 0.47847074270248413,
+ "learning_rate": 3.0712251317088426e-06,
+ "loss": 0.4317,
+ "step": 600
+ },
+ {
+ "epoch": 2.8322884012539182,
+ "grad_norm": 0.7754543423652649,
+ "learning_rate": 3.064699013637983e-06,
+ "loss": 0.4528,
+ "step": 601
+ },
+ {
+ "epoch": 2.8369905956112853,
+ "grad_norm": 0.5581084489822388,
+ "learning_rate": 3.0581688380413115e-06,
+ "loss": 0.4369,
+ "step": 602
+ },
+ {
+ "epoch": 2.841692789968652,
+ "grad_norm": 0.588622510433197,
+ "learning_rate": 3.0516346518400315e-06,
+ "loss": 0.4517,
+ "step": 603
+ },
+ {
+ "epoch": 2.846394984326019,
+ "grad_norm": 0.565423846244812,
+ "learning_rate": 3.0450965019841593e-06,
+ "loss": 0.4517,
+ "step": 604
+ },
+ {
+ "epoch": 2.8510971786833856,
+ "grad_norm": 0.47801777720451355,
+ "learning_rate": 3.0385544354521957e-06,
+ "loss": 0.4161,
+ "step": 605
+ },
+ {
+ "epoch": 2.8557993730407523,
+ "grad_norm": 0.5034862756729126,
+ "learning_rate": 3.0320084992507814e-06,
+ "loss": 0.4428,
+ "step": 606
+ },
+ {
+ "epoch": 2.860501567398119,
+ "grad_norm": 0.5339663624763489,
+ "learning_rate": 3.0254587404143604e-06,
+ "loss": 0.4792,
+ "step": 607
+ },
+ {
+ "epoch": 2.865203761755486,
+ "grad_norm": 0.48184943199157715,
+ "learning_rate": 3.0189052060048464e-06,
+ "loss": 0.4409,
+ "step": 608
+ },
+ {
+ "epoch": 2.8699059561128526,
+ "grad_norm": 0.5102176070213318,
+ "learning_rate": 3.01234794311128e-06,
+ "loss": 0.438,
+ "step": 609
+ },
+ {
+ "epoch": 2.8746081504702197,
+ "grad_norm": 0.5111781358718872,
+ "learning_rate": 3.0057869988494925e-06,
+ "loss": 0.4617,
+ "step": 610
+ },
+ {
+ "epoch": 2.8793103448275863,
+ "grad_norm": 0.5915101766586304,
+ "learning_rate": 2.999222420361767e-06,
+ "loss": 0.4532,
+ "step": 611
+ },
+ {
+ "epoch": 2.884012539184953,
+ "grad_norm": 0.48898932337760925,
+ "learning_rate": 2.9926542548165e-06,
+ "loss": 0.4663,
+ "step": 612
+ },
+ {
+ "epoch": 2.8887147335423196,
+ "grad_norm": 0.4943861961364746,
+ "learning_rate": 2.9860825494078605e-06,
+ "loss": 0.4354,
+ "step": 613
+ },
+ {
+ "epoch": 2.8934169278996866,
+ "grad_norm": 0.5398025512695312,
+ "learning_rate": 2.979507351355454e-06,
+ "loss": 0.4546,
+ "step": 614
+ },
+ {
+ "epoch": 2.8981191222570533,
+ "grad_norm": 0.545421302318573,
+ "learning_rate": 2.972928707903981e-06,
+ "loss": 0.4404,
+ "step": 615
+ },
+ {
+ "epoch": 2.9028213166144203,
+ "grad_norm": 0.5370550751686096,
+ "learning_rate": 2.966346666322898e-06,
+ "loss": 0.4401,
+ "step": 616
+ },
+ {
+ "epoch": 2.907523510971787,
+ "grad_norm": 0.5280672311782837,
+ "learning_rate": 2.9597612739060775e-06,
+ "loss": 0.4172,
+ "step": 617
+ },
+ {
+ "epoch": 2.9122257053291536,
+ "grad_norm": 0.5043423175811768,
+ "learning_rate": 2.9531725779714713e-06,
+ "loss": 0.4487,
+ "step": 618
+ },
+ {
+ "epoch": 2.91692789968652,
+ "grad_norm": 1.961200475692749,
+ "learning_rate": 2.9465806258607653e-06,
+ "loss": 0.4548,
+ "step": 619
+ },
+ {
+ "epoch": 2.9216300940438873,
+ "grad_norm": 0.5286726355552673,
+ "learning_rate": 2.939985464939043e-06,
+ "loss": 0.4566,
+ "step": 620
+ },
+ {
+ "epoch": 2.926332288401254,
+ "grad_norm": 0.5209453105926514,
+ "learning_rate": 2.9333871425944434e-06,
+ "loss": 0.4064,
+ "step": 621
+ },
+ {
+ "epoch": 2.9310344827586206,
+ "grad_norm": 0.47711747884750366,
+ "learning_rate": 2.926785706237822e-06,
+ "loss": 0.4341,
+ "step": 622
+ },
+ {
+ "epoch": 2.9357366771159876,
+ "grad_norm": 0.45926427841186523,
+ "learning_rate": 2.920181203302409e-06,
+ "loss": 0.4256,
+ "step": 623
+ },
+ {
+ "epoch": 2.9404388714733543,
+ "grad_norm": 0.5624600648880005,
+ "learning_rate": 2.91357368124347e-06,
+ "loss": 0.4252,
+ "step": 624
+ },
+ {
+ "epoch": 2.945141065830721,
+ "grad_norm": 0.5101850628852844,
+ "learning_rate": 2.906963187537962e-06,
+ "loss": 0.4352,
+ "step": 625
+ },
+ {
+ "epoch": 2.9498432601880875,
+ "grad_norm": 0.5341358184814453,
+ "learning_rate": 2.9003497696841955e-06,
+ "loss": 0.4132,
+ "step": 626
+ },
+ {
+ "epoch": 2.9545454545454546,
+ "grad_norm": 0.5917084217071533,
+ "learning_rate": 2.8937334752014913e-06,
+ "loss": 0.4693,
+ "step": 627
+ },
+ {
+ "epoch": 2.959247648902821,
+ "grad_norm": 0.793695330619812,
+ "learning_rate": 2.887114351629839e-06,
+ "loss": 0.4431,
+ "step": 628
+ },
+ {
+ "epoch": 2.9639498432601883,
+ "grad_norm": 0.5363728404045105,
+ "learning_rate": 2.8804924465295575e-06,
+ "loss": 0.4672,
+ "step": 629
+ },
+ {
+ "epoch": 2.968652037617555,
+ "grad_norm": 0.4979572892189026,
+ "learning_rate": 2.873867807480951e-06,
+ "loss": 0.4723,
+ "step": 630
+ },
+ {
+ "epoch": 2.9733542319749215,
+ "grad_norm": 0.5310130715370178,
+ "learning_rate": 2.8672404820839676e-06,
+ "loss": 0.4388,
+ "step": 631
+ },
+ {
+ "epoch": 2.978056426332288,
+ "grad_norm": 0.530015766620636,
+ "learning_rate": 2.8606105179578584e-06,
+ "loss": 0.4466,
+ "step": 632
+ },
+ {
+ "epoch": 2.9827586206896552,
+ "grad_norm": 0.5356627702713013,
+ "learning_rate": 2.8539779627408332e-06,
+ "loss": 0.4252,
+ "step": 633
+ },
+ {
+ "epoch": 2.987460815047022,
+ "grad_norm": 0.5290245413780212,
+ "learning_rate": 2.847342864089721e-06,
+ "loss": 0.4453,
+ "step": 634
+ },
+ {
+ "epoch": 2.992163009404389,
+ "grad_norm": 0.471682071685791,
+ "learning_rate": 2.8407052696796255e-06,
+ "loss": 0.43,
+ "step": 635
+ },
+ {
+ "epoch": 2.9968652037617556,
+ "grad_norm": 0.5220829844474792,
+ "learning_rate": 2.834065227203584e-06,
+ "loss": 0.4494,
+ "step": 636
+ },
+ {
+ "epoch": 3.0047021943573666,
+ "grad_norm": 0.4797399342060089,
+ "learning_rate": 2.8274227843722213e-06,
+ "loss": 0.8683,
+ "step": 637
+ },
+ {
+ "epoch": 3.0094043887147337,
+ "grad_norm": 0.5463248491287231,
+ "learning_rate": 2.820777988913412e-06,
+ "loss": 0.4157,
+ "step": 638
+ },
+ {
+ "epoch": 3.0141065830721003,
+ "grad_norm": 0.5081924200057983,
+ "learning_rate": 2.8141308885719337e-06,
+ "loss": 0.4169,
+ "step": 639
+ },
+ {
+ "epoch": 3.018808777429467,
+ "grad_norm": 0.4916677474975586,
+ "learning_rate": 2.8074815311091265e-06,
+ "loss": 0.3898,
+ "step": 640
+ },
+ {
+ "epoch": 3.023510971786834,
+ "grad_norm": 0.48858827352523804,
+ "learning_rate": 2.8008299643025477e-06,
+ "loss": 0.4319,
+ "step": 641
+ },
+ {
+ "epoch": 3.0282131661442007,
+ "grad_norm": 0.49183058738708496,
+ "learning_rate": 2.7941762359456294e-06,
+ "loss": 0.4243,
+ "step": 642
+ },
+ {
+ "epoch": 3.0329153605015673,
+ "grad_norm": 0.5068245530128479,
+ "learning_rate": 2.787520393847334e-06,
+ "loss": 0.4168,
+ "step": 643
+ },
+ {
+ "epoch": 3.0376175548589344,
+ "grad_norm": 0.542245090007782,
+ "learning_rate": 2.780862485831814e-06,
+ "loss": 0.4289,
+ "step": 644
+ },
+ {
+ "epoch": 3.042319749216301,
+ "grad_norm": 0.49114999175071716,
+ "learning_rate": 2.7742025597380644e-06,
+ "loss": 0.4337,
+ "step": 645
+ },
+ {
+ "epoch": 3.0470219435736676,
+ "grad_norm": 0.4982999563217163,
+ "learning_rate": 2.7675406634195824e-06,
+ "loss": 0.4207,
+ "step": 646
+ },
+ {
+ "epoch": 3.0517241379310347,
+ "grad_norm": 0.5352709293365479,
+ "learning_rate": 2.7608768447440193e-06,
+ "loss": 0.4087,
+ "step": 647
+ },
+ {
+ "epoch": 3.0564263322884013,
+ "grad_norm": 0.5486279726028442,
+ "learning_rate": 2.754211151592841e-06,
+ "loss": 0.4129,
+ "step": 648
+ },
+ {
+ "epoch": 3.061128526645768,
+ "grad_norm": 0.6048034429550171,
+ "learning_rate": 2.7475436318609827e-06,
+ "loss": 0.433,
+ "step": 649
+ },
+ {
+ "epoch": 3.0658307210031346,
+ "grad_norm": 0.6576470136642456,
+ "learning_rate": 2.7408743334565006e-06,
+ "loss": 0.4086,
+ "step": 650
+ },
+ {
+ "epoch": 3.0705329153605017,
+ "grad_norm": 0.49989938735961914,
+ "learning_rate": 2.734203304300235e-06,
+ "loss": 0.3999,
+ "step": 651
+ },
+ {
+ "epoch": 3.0752351097178683,
+ "grad_norm": 0.5238141417503357,
+ "learning_rate": 2.7275305923254607e-06,
+ "loss": 0.4133,
+ "step": 652
+ },
+ {
+ "epoch": 3.079937304075235,
+ "grad_norm": 0.5244804620742798,
+ "learning_rate": 2.720856245477544e-06,
+ "loss": 0.4016,
+ "step": 653
+ },
+ {
+ "epoch": 3.084639498432602,
+ "grad_norm": 0.5036159753799438,
+ "learning_rate": 2.7141803117135978e-06,
+ "loss": 0.3972,
+ "step": 654
+ },
+ {
+ "epoch": 3.0893416927899686,
+ "grad_norm": 0.5390443801879883,
+ "learning_rate": 2.7075028390021385e-06,
+ "loss": 0.3992,
+ "step": 655
+ },
+ {
+ "epoch": 3.0940438871473352,
+ "grad_norm": 0.5226757526397705,
+ "learning_rate": 2.7008238753227385e-06,
+ "loss": 0.4074,
+ "step": 656
+ },
+ {
+ "epoch": 3.0987460815047023,
+ "grad_norm": 0.48386913537979126,
+ "learning_rate": 2.694143468665685e-06,
+ "loss": 0.4284,
+ "step": 657
+ },
+ {
+ "epoch": 3.103448275862069,
+ "grad_norm": 0.5081993341445923,
+ "learning_rate": 2.6874616670316338e-06,
+ "loss": 0.3952,
+ "step": 658
+ },
+ {
+ "epoch": 3.1081504702194356,
+ "grad_norm": 0.538280189037323,
+ "learning_rate": 2.6807785184312618e-06,
+ "loss": 0.4136,
+ "step": 659
+ },
+ {
+ "epoch": 3.1128526645768027,
+ "grad_norm": 0.7804566621780396,
+ "learning_rate": 2.674094070884926e-06,
+ "loss": 0.4131,
+ "step": 660
+ },
+ {
+ "epoch": 3.1175548589341693,
+ "grad_norm": 0.6693199872970581,
+ "learning_rate": 2.6674083724223166e-06,
+ "loss": 0.4329,
+ "step": 661
+ },
+ {
+ "epoch": 3.122257053291536,
+ "grad_norm": 0.5034769773483276,
+ "learning_rate": 2.6607214710821112e-06,
+ "loss": 0.4062,
+ "step": 662
+ },
+ {
+ "epoch": 3.126959247648903,
+ "grad_norm": 0.5518231391906738,
+ "learning_rate": 2.6540334149116304e-06,
+ "loss": 0.4172,
+ "step": 663
+ },
+ {
+ "epoch": 3.1316614420062696,
+ "grad_norm": 0.5797336101531982,
+ "learning_rate": 2.647344251966493e-06,
+ "loss": 0.4164,
+ "step": 664
+ },
+ {
+ "epoch": 3.1363636363636362,
+ "grad_norm": 0.5404736399650574,
+ "learning_rate": 2.6406540303102714e-06,
+ "loss": 0.4157,
+ "step": 665
+ },
+ {
+ "epoch": 3.1410658307210033,
+ "grad_norm": 0.5246729850769043,
+ "learning_rate": 2.6339627980141425e-06,
+ "loss": 0.4165,
+ "step": 666
+ },
+ {
+ "epoch": 3.14576802507837,
+ "grad_norm": 0.5443553328514099,
+ "learning_rate": 2.6272706031565482e-06,
+ "loss": 0.4022,
+ "step": 667
+ },
+ {
+ "epoch": 3.1504702194357366,
+ "grad_norm": 0.5127459168434143,
+ "learning_rate": 2.6205774938228433e-06,
+ "loss": 0.3983,
+ "step": 668
+ },
+ {
+ "epoch": 3.1551724137931036,
+ "grad_norm": 0.5095480680465698,
+ "learning_rate": 2.6138835181049556e-06,
+ "loss": 0.4227,
+ "step": 669
+ },
+ {
+ "epoch": 3.1598746081504703,
+ "grad_norm": 0.5238015651702881,
+ "learning_rate": 2.6071887241010374e-06,
+ "loss": 0.4056,
+ "step": 670
+ },
+ {
+ "epoch": 3.164576802507837,
+ "grad_norm": 0.5659390687942505,
+ "learning_rate": 2.6004931599151223e-06,
+ "loss": 0.3933,
+ "step": 671
+ },
+ {
+ "epoch": 3.169278996865204,
+ "grad_norm": 0.528191328048706,
+ "learning_rate": 2.593796873656775e-06,
+ "loss": 0.4356,
+ "step": 672
+ },
+ {
+ "epoch": 3.1739811912225706,
+ "grad_norm": 1.1774086952209473,
+ "learning_rate": 2.587099913440749e-06,
+ "loss": 0.4149,
+ "step": 673
+ },
+ {
+ "epoch": 3.1786833855799372,
+ "grad_norm": 0.5629571676254272,
+ "learning_rate": 2.580402327386643e-06,
+ "loss": 0.403,
+ "step": 674
+ },
+ {
+ "epoch": 3.183385579937304,
+ "grad_norm": 1.1260513067245483,
+ "learning_rate": 2.5737041636185496e-06,
+ "loss": 0.4102,
+ "step": 675
+ },
+ {
+ "epoch": 3.188087774294671,
+ "grad_norm": 0.6467511653900146,
+ "learning_rate": 2.5670054702647146e-06,
+ "loss": 0.3948,
+ "step": 676
+ },
+ {
+ "epoch": 3.1927899686520376,
+ "grad_norm": 0.5177720785140991,
+ "learning_rate": 2.5603062954571872e-06,
+ "loss": 0.4188,
+ "step": 677
+ },
+ {
+ "epoch": 3.197492163009404,
+ "grad_norm": 0.5086417198181152,
+ "learning_rate": 2.553606687331477e-06,
+ "loss": 0.4403,
+ "step": 678
+ },
+ {
+ "epoch": 3.2021943573667713,
+ "grad_norm": 0.5762012600898743,
+ "learning_rate": 2.5469066940262073e-06,
+ "loss": 0.4084,
+ "step": 679
+ },
+ {
+ "epoch": 3.206896551724138,
+ "grad_norm": 0.5122736692428589,
+ "learning_rate": 2.540206363682768e-06,
+ "loss": 0.4005,
+ "step": 680
+ },
+ {
+ "epoch": 3.2115987460815045,
+ "grad_norm": 0.5179394483566284,
+ "learning_rate": 2.533505744444972e-06,
+ "loss": 0.419,
+ "step": 681
+ },
+ {
+ "epoch": 3.2163009404388716,
+ "grad_norm": 0.5541443824768066,
+ "learning_rate": 2.526804884458707e-06,
+ "loss": 0.4112,
+ "step": 682
+ },
+ {
+ "epoch": 3.2210031347962382,
+ "grad_norm": 0.5687317252159119,
+ "learning_rate": 2.520103831871591e-06,
+ "loss": 0.4145,
+ "step": 683
+ },
+ {
+ "epoch": 3.225705329153605,
+ "grad_norm": 0.5060294270515442,
+ "learning_rate": 2.513402634832627e-06,
+ "loss": 0.3933,
+ "step": 684
+ },
+ {
+ "epoch": 3.230407523510972,
+ "grad_norm": 0.6311008930206299,
+ "learning_rate": 2.5067013414918523e-06,
+ "loss": 0.401,
+ "step": 685
+ },
+ {
+ "epoch": 3.2351097178683386,
+ "grad_norm": 0.5575832724571228,
+ "learning_rate": 2.5e-06,
+ "loss": 0.4127,
+ "step": 686
+ },
+ {
+ "epoch": 3.239811912225705,
+ "grad_norm": 0.5105507373809814,
+ "learning_rate": 2.493298658508149e-06,
+ "loss": 0.3971,
+ "step": 687
+ },
+ {
+ "epoch": 3.2445141065830723,
+ "grad_norm": 0.5813129544258118,
+ "learning_rate": 2.4865973651673743e-06,
+ "loss": 0.4136,
+ "step": 688
+ },
+ {
+ "epoch": 3.249216300940439,
+ "grad_norm": 0.5921242833137512,
+ "learning_rate": 2.4798961681284096e-06,
+ "loss": 0.437,
+ "step": 689
+ },
+ {
+ "epoch": 3.2539184952978055,
+ "grad_norm": 0.5654864311218262,
+ "learning_rate": 2.473195115541293e-06,
+ "loss": 0.3939,
+ "step": 690
+ },
+ {
+ "epoch": 3.2586206896551726,
+ "grad_norm": 0.5103882551193237,
+ "learning_rate": 2.466494255555029e-06,
+ "loss": 0.4394,
+ "step": 691
+ },
+ {
+ "epoch": 3.2633228840125392,
+ "grad_norm": 0.5423967242240906,
+ "learning_rate": 2.459793636317233e-06,
+ "loss": 0.4048,
+ "step": 692
+ },
+ {
+ "epoch": 3.268025078369906,
+ "grad_norm": 0.6185951828956604,
+ "learning_rate": 2.4530933059737936e-06,
+ "loss": 0.4432,
+ "step": 693
+ },
+ {
+ "epoch": 3.2727272727272725,
+ "grad_norm": 0.6062753796577454,
+ "learning_rate": 2.4463933126685236e-06,
+ "loss": 0.4061,
+ "step": 694
+ },
+ {
+ "epoch": 3.2774294670846396,
+ "grad_norm": 0.5118281841278076,
+ "learning_rate": 2.439693704542814e-06,
+ "loss": 0.4008,
+ "step": 695
+ },
+ {
+ "epoch": 3.282131661442006,
+ "grad_norm": 0.9080231785774231,
+ "learning_rate": 2.432994529735286e-06,
+ "loss": 0.409,
+ "step": 696
+ },
+ {
+ "epoch": 3.2868338557993733,
+ "grad_norm": 0.550635814666748,
+ "learning_rate": 2.4262958363814512e-06,
+ "loss": 0.4202,
+ "step": 697
+ },
+ {
+ "epoch": 3.29153605015674,
+ "grad_norm": 0.5728116631507874,
+ "learning_rate": 2.4195976726133574e-06,
+ "loss": 0.406,
+ "step": 698
+ },
+ {
+ "epoch": 3.2962382445141065,
+ "grad_norm": 0.4995472729206085,
+ "learning_rate": 2.4129000865592517e-06,
+ "loss": 0.4063,
+ "step": 699
+ },
+ {
+ "epoch": 3.300940438871473,
+ "grad_norm": 0.601259708404541,
+ "learning_rate": 2.4062031263432267e-06,
+ "loss": 0.4268,
+ "step": 700
+ },
+ {
+ "epoch": 3.30564263322884,
+ "grad_norm": 0.570606529712677,
+ "learning_rate": 2.3995068400848785e-06,
+ "loss": 0.4034,
+ "step": 701
+ },
+ {
+ "epoch": 3.310344827586207,
+ "grad_norm": 0.5638160705566406,
+ "learning_rate": 2.392811275898963e-06,
+ "loss": 0.4212,
+ "step": 702
+ },
+ {
+ "epoch": 3.3150470219435735,
+ "grad_norm": 0.5354572534561157,
+ "learning_rate": 2.3861164818950448e-06,
+ "loss": 0.3893,
+ "step": 703
+ },
+ {
+ "epoch": 3.3197492163009406,
+ "grad_norm": 0.5149163603782654,
+ "learning_rate": 2.379422506177157e-06,
+ "loss": 0.4126,
+ "step": 704
+ },
+ {
+ "epoch": 3.324451410658307,
+ "grad_norm": 0.5132194757461548,
+ "learning_rate": 2.372729396843453e-06,
+ "loss": 0.4132,
+ "step": 705
+ },
+ {
+ "epoch": 3.329153605015674,
+ "grad_norm": 0.5163543224334717,
+ "learning_rate": 2.366037201985858e-06,
+ "loss": 0.418,
+ "step": 706
+ },
+ {
+ "epoch": 3.333855799373041,
+ "grad_norm": 0.5132508277893066,
+ "learning_rate": 2.3593459696897294e-06,
+ "loss": 0.3944,
+ "step": 707
+ },
+ {
+ "epoch": 3.3385579937304075,
+ "grad_norm": 0.5490009188652039,
+ "learning_rate": 2.352655748033508e-06,
+ "loss": 0.414,
+ "step": 708
+ },
+ {
+ "epoch": 3.343260188087774,
+ "grad_norm": 0.5879104733467102,
+ "learning_rate": 2.3459665850883704e-06,
+ "loss": 0.4344,
+ "step": 709
+ },
+ {
+ "epoch": 3.347962382445141,
+ "grad_norm": 0.5451306700706482,
+ "learning_rate": 2.33927852891789e-06,
+ "loss": 0.4208,
+ "step": 710
+ },
+ {
+ "epoch": 3.352664576802508,
+ "grad_norm": 0.5207070708274841,
+ "learning_rate": 2.3325916275776834e-06,
+ "loss": 0.4398,
+ "step": 711
+ },
+ {
+ "epoch": 3.3573667711598745,
+ "grad_norm": 0.5440477132797241,
+ "learning_rate": 2.3259059291150744e-06,
+ "loss": 0.4015,
+ "step": 712
+ },
+ {
+ "epoch": 3.3620689655172415,
+ "grad_norm": 0.5619958639144897,
+ "learning_rate": 2.319221481568739e-06,
+ "loss": 0.4196,
+ "step": 713
+ },
+ {
+ "epoch": 3.366771159874608,
+ "grad_norm": 0.6007470488548279,
+ "learning_rate": 2.3125383329683666e-06,
+ "loss": 0.4217,
+ "step": 714
+ },
+ {
+ "epoch": 3.371473354231975,
+ "grad_norm": 0.4972032904624939,
+ "learning_rate": 2.3058565313343152e-06,
+ "loss": 0.3904,
+ "step": 715
+ },
+ {
+ "epoch": 3.376175548589342,
+ "grad_norm": 0.5420966148376465,
+ "learning_rate": 2.2991761246772623e-06,
+ "loss": 0.4048,
+ "step": 716
+ },
+ {
+ "epoch": 3.3808777429467085,
+ "grad_norm": 0.520063042640686,
+ "learning_rate": 2.2924971609978623e-06,
+ "loss": 0.3965,
+ "step": 717
+ },
+ {
+ "epoch": 3.385579937304075,
+ "grad_norm": 0.8903913497924805,
+ "learning_rate": 2.285819688286403e-06,
+ "loss": 0.3873,
+ "step": 718
+ },
+ {
+ "epoch": 3.3902821316614418,
+ "grad_norm": 0.5380633473396301,
+ "learning_rate": 2.2791437545224563e-06,
+ "loss": 0.4335,
+ "step": 719
+ },
+ {
+ "epoch": 3.394984326018809,
+ "grad_norm": 0.5058356523513794,
+ "learning_rate": 2.2724694076745397e-06,
+ "loss": 0.4134,
+ "step": 720
+ },
+ {
+ "epoch": 3.3996865203761755,
+ "grad_norm": 0.5383400321006775,
+ "learning_rate": 2.265796695699766e-06,
+ "loss": 0.4154,
+ "step": 721
+ },
+ {
+ "epoch": 3.4043887147335425,
+ "grad_norm": 0.5831345319747925,
+ "learning_rate": 2.2591256665434998e-06,
+ "loss": 0.4193,
+ "step": 722
+ },
+ {
+ "epoch": 3.409090909090909,
+ "grad_norm": 0.5494023561477661,
+ "learning_rate": 2.252456368139019e-06,
+ "loss": 0.4137,
+ "step": 723
+ },
+ {
+ "epoch": 3.413793103448276,
+ "grad_norm": 0.5735755562782288,
+ "learning_rate": 2.245788848407159e-06,
+ "loss": 0.4211,
+ "step": 724
+ },
+ {
+ "epoch": 3.4184952978056424,
+ "grad_norm": 0.5244953036308289,
+ "learning_rate": 2.2391231552559815e-06,
+ "loss": 0.4194,
+ "step": 725
+ },
+ {
+ "epoch": 3.4231974921630095,
+ "grad_norm": 0.5803194642066956,
+ "learning_rate": 2.2324593365804184e-06,
+ "loss": 0.3882,
+ "step": 726
+ },
+ {
+ "epoch": 3.427899686520376,
+ "grad_norm": 0.5303656458854675,
+ "learning_rate": 2.225797440261936e-06,
+ "loss": 0.4336,
+ "step": 727
+ },
+ {
+ "epoch": 3.4326018808777428,
+ "grad_norm": 0.6270896792411804,
+ "learning_rate": 2.219137514168187e-06,
+ "loss": 0.397,
+ "step": 728
+ },
+ {
+ "epoch": 3.43730407523511,
+ "grad_norm": 0.5054409503936768,
+ "learning_rate": 2.212479606152667e-06,
+ "loss": 0.4261,
+ "step": 729
+ },
+ {
+ "epoch": 3.4420062695924765,
+ "grad_norm": 0.5422618985176086,
+ "learning_rate": 2.205823764054372e-06,
+ "loss": 0.4105,
+ "step": 730
+ },
+ {
+ "epoch": 3.446708463949843,
+ "grad_norm": 0.5200968980789185,
+ "learning_rate": 2.199170035697453e-06,
+ "loss": 0.4048,
+ "step": 731
+ },
+ {
+ "epoch": 3.45141065830721,
+ "grad_norm": 0.5316998362541199,
+ "learning_rate": 2.1925184688908735e-06,
+ "loss": 0.4132,
+ "step": 732
+ },
+ {
+ "epoch": 3.456112852664577,
+ "grad_norm": 0.5780388116836548,
+ "learning_rate": 2.185869111428067e-06,
+ "loss": 0.4381,
+ "step": 733
+ },
+ {
+ "epoch": 3.4608150470219434,
+ "grad_norm": 0.5547174215316772,
+ "learning_rate": 2.1792220110865885e-06,
+ "loss": 0.4236,
+ "step": 734
+ },
+ {
+ "epoch": 3.4655172413793105,
+ "grad_norm": 0.5188453197479248,
+ "learning_rate": 2.1725772156277795e-06,
+ "loss": 0.4052,
+ "step": 735
+ },
+ {
+ "epoch": 3.470219435736677,
+ "grad_norm": 0.5145602822303772,
+ "learning_rate": 2.165934772796417e-06,
+ "loss": 0.412,
+ "step": 736
+ },
+ {
+ "epoch": 3.4749216300940438,
+ "grad_norm": 0.5960094332695007,
+ "learning_rate": 2.159294730320374e-06,
+ "loss": 0.426,
+ "step": 737
+ },
+ {
+ "epoch": 3.479623824451411,
+ "grad_norm": 0.7090360522270203,
+ "learning_rate": 2.15265713591028e-06,
+ "loss": 0.4133,
+ "step": 738
+ },
+ {
+ "epoch": 3.4843260188087775,
+ "grad_norm": 0.5428952574729919,
+ "learning_rate": 2.1460220372591676e-06,
+ "loss": 0.4332,
+ "step": 739
+ },
+ {
+ "epoch": 3.489028213166144,
+ "grad_norm": 0.6610196232795715,
+ "learning_rate": 2.139389482042142e-06,
+ "loss": 0.3985,
+ "step": 740
+ },
+ {
+ "epoch": 3.493730407523511,
+ "grad_norm": 0.5409770607948303,
+ "learning_rate": 2.1327595179160332e-06,
+ "loss": 0.4148,
+ "step": 741
+ },
+ {
+ "epoch": 3.498432601880878,
+ "grad_norm": 0.8822159171104431,
+ "learning_rate": 2.1261321925190492e-06,
+ "loss": 0.4071,
+ "step": 742
+ },
+ {
+ "epoch": 3.5031347962382444,
+ "grad_norm": 0.5366957783699036,
+ "learning_rate": 2.1195075534704433e-06,
+ "loss": 0.3838,
+ "step": 743
+ },
+ {
+ "epoch": 3.507836990595611,
+ "grad_norm": 0.5289701819419861,
+ "learning_rate": 2.1128856483701625e-06,
+ "loss": 0.4123,
+ "step": 744
+ },
+ {
+ "epoch": 3.512539184952978,
+ "grad_norm": 0.5737835764884949,
+ "learning_rate": 2.10626652479851e-06,
+ "loss": 0.392,
+ "step": 745
+ },
+ {
+ "epoch": 3.5172413793103448,
+ "grad_norm": 0.5381962060928345,
+ "learning_rate": 2.0996502303158057e-06,
+ "loss": 0.4088,
+ "step": 746
+ },
+ {
+ "epoch": 3.521943573667712,
+ "grad_norm": 0.529466450214386,
+ "learning_rate": 2.0930368124620385e-06,
+ "loss": 0.4098,
+ "step": 747
+ },
+ {
+ "epoch": 3.5266457680250785,
+ "grad_norm": 0.6686971783638,
+ "learning_rate": 2.086426318756531e-06,
+ "loss": 0.4273,
+ "step": 748
+ },
+ {
+ "epoch": 3.531347962382445,
+ "grad_norm": 0.5246966481208801,
+ "learning_rate": 2.0798187966975917e-06,
+ "loss": 0.4318,
+ "step": 749
+ },
+ {
+ "epoch": 3.5360501567398117,
+ "grad_norm": 0.5165736675262451,
+ "learning_rate": 2.073214293762179e-06,
+ "loss": 0.4212,
+ "step": 750
+ },
+ {
+ "epoch": 3.540752351097179,
+ "grad_norm": 0.6821503043174744,
+ "learning_rate": 2.0666128574055575e-06,
+ "loss": 0.4199,
+ "step": 751
+ },
+ {
+ "epoch": 3.5454545454545454,
+ "grad_norm": 0.5294732451438904,
+ "learning_rate": 2.0600145350609585e-06,
+ "loss": 0.4192,
+ "step": 752
+ },
+ {
+ "epoch": 3.5501567398119125,
+ "grad_norm": 0.515800416469574,
+ "learning_rate": 2.053419374139235e-06,
+ "loss": 0.4172,
+ "step": 753
+ },
+ {
+ "epoch": 3.554858934169279,
+ "grad_norm": 0.5241639614105225,
+ "learning_rate": 2.0468274220285295e-06,
+ "loss": 0.4138,
+ "step": 754
+ },
+ {
+ "epoch": 3.5595611285266457,
+ "grad_norm": 0.546105146408081,
+ "learning_rate": 2.0402387260939224e-06,
+ "loss": 0.4123,
+ "step": 755
+ },
+ {
+ "epoch": 3.5642633228840124,
+ "grad_norm": 0.5261510014533997,
+ "learning_rate": 2.033653333677103e-06,
+ "loss": 0.4225,
+ "step": 756
+ },
+ {
+ "epoch": 3.5689655172413794,
+ "grad_norm": 0.5825217366218567,
+ "learning_rate": 2.02707129209602e-06,
+ "loss": 0.4042,
+ "step": 757
+ },
+ {
+ "epoch": 3.573667711598746,
+ "grad_norm": 0.5916388034820557,
+ "learning_rate": 2.0204926486445463e-06,
+ "loss": 0.4222,
+ "step": 758
+ },
+ {
+ "epoch": 3.5783699059561127,
+ "grad_norm": 0.5643376708030701,
+ "learning_rate": 2.0139174505921403e-06,
+ "loss": 0.4419,
+ "step": 759
+ },
+ {
+ "epoch": 3.58307210031348,
+ "grad_norm": 0.5426534414291382,
+ "learning_rate": 2.0073457451835e-06,
+ "loss": 0.3985,
+ "step": 760
+ },
+ {
+ "epoch": 3.5877742946708464,
+ "grad_norm": 0.48811203241348267,
+ "learning_rate": 2.0007775796382335e-06,
+ "loss": 0.4249,
+ "step": 761
+ },
+ {
+ "epoch": 3.592476489028213,
+ "grad_norm": 0.5216817855834961,
+ "learning_rate": 1.994213001150508e-06,
+ "loss": 0.3931,
+ "step": 762
+ },
+ {
+ "epoch": 3.5971786833855797,
+ "grad_norm": 0.5739433169364929,
+ "learning_rate": 1.9876520568887207e-06,
+ "loss": 0.42,
+ "step": 763
+ },
+ {
+ "epoch": 3.6018808777429467,
+ "grad_norm": 0.5166419148445129,
+ "learning_rate": 1.981094793995155e-06,
+ "loss": 0.4041,
+ "step": 764
+ },
+ {
+ "epoch": 3.6065830721003134,
+ "grad_norm": 0.6763928532600403,
+ "learning_rate": 1.974541259585641e-06,
+ "loss": 0.4319,
+ "step": 765
+ },
+ {
+ "epoch": 3.6112852664576804,
+ "grad_norm": 0.5443664789199829,
+ "learning_rate": 1.9679915007492194e-06,
+ "loss": 0.4139,
+ "step": 766
+ },
+ {
+ "epoch": 3.615987460815047,
+ "grad_norm": 0.6719280481338501,
+ "learning_rate": 1.9614455645478047e-06,
+ "loss": 0.4015,
+ "step": 767
+ },
+ {
+ "epoch": 3.6206896551724137,
+ "grad_norm": 0.5685383677482605,
+ "learning_rate": 1.9549034980158403e-06,
+ "loss": 0.4153,
+ "step": 768
+ },
+ {
+ "epoch": 3.6253918495297803,
+ "grad_norm": 0.5463993549346924,
+ "learning_rate": 1.9483653481599697e-06,
+ "loss": 0.4193,
+ "step": 769
+ },
+ {
+ "epoch": 3.6300940438871474,
+ "grad_norm": 0.5228095054626465,
+ "learning_rate": 1.9418311619586897e-06,
+ "loss": 0.4268,
+ "step": 770
+ },
+ {
+ "epoch": 3.634796238244514,
+ "grad_norm": 0.6472461223602295,
+ "learning_rate": 1.935300986362018e-06,
+ "loss": 0.3981,
+ "step": 771
+ },
+ {
+ "epoch": 3.639498432601881,
+ "grad_norm": 0.61808842420578,
+ "learning_rate": 1.9287748682911582e-06,
+ "loss": 0.4313,
+ "step": 772
+ },
+ {
+ "epoch": 3.6442006269592477,
+ "grad_norm": 0.5122710466384888,
+ "learning_rate": 1.9222528546381543e-06,
+ "loss": 0.4219,
+ "step": 773
+ },
+ {
+ "epoch": 3.6489028213166144,
+ "grad_norm": 0.5540320873260498,
+ "learning_rate": 1.9157349922655648e-06,
+ "loss": 0.4001,
+ "step": 774
+ },
+ {
+ "epoch": 3.653605015673981,
+ "grad_norm": 0.5066401958465576,
+ "learning_rate": 1.909221328006114e-06,
+ "loss": 0.4089,
+ "step": 775
+ },
+ {
+ "epoch": 3.658307210031348,
+ "grad_norm": 0.5802583694458008,
+ "learning_rate": 1.9027119086623647e-06,
+ "loss": 0.4216,
+ "step": 776
+ },
+ {
+ "epoch": 3.6630094043887147,
+ "grad_norm": 0.5735054016113281,
+ "learning_rate": 1.8962067810063806e-06,
+ "loss": 0.4372,
+ "step": 777
+ },
+ {
+ "epoch": 3.6677115987460818,
+ "grad_norm": 0.5177802443504333,
+ "learning_rate": 1.8897059917793844e-06,
+ "loss": 0.3912,
+ "step": 778
+ },
+ {
+ "epoch": 3.6724137931034484,
+ "grad_norm": 0.5790892243385315,
+ "learning_rate": 1.8832095876914268e-06,
+ "loss": 0.4096,
+ "step": 779
+ },
+ {
+ "epoch": 3.677115987460815,
+ "grad_norm": 0.5386017560958862,
+ "learning_rate": 1.8767176154210537e-06,
+ "loss": 0.4191,
+ "step": 780
+ },
+ {
+ "epoch": 3.6818181818181817,
+ "grad_norm": 0.5927474498748779,
+ "learning_rate": 1.8702301216149616e-06,
+ "loss": 0.4061,
+ "step": 781
+ },
+ {
+ "epoch": 3.6865203761755487,
+ "grad_norm": 0.5609317421913147,
+ "learning_rate": 1.8637471528876727e-06,
+ "loss": 0.4067,
+ "step": 782
+ },
+ {
+ "epoch": 3.6912225705329154,
+ "grad_norm": 0.6609043478965759,
+ "learning_rate": 1.8572687558211923e-06,
+ "loss": 0.4183,
+ "step": 783
+ },
+ {
+ "epoch": 3.695924764890282,
+ "grad_norm": 0.5092527270317078,
+ "learning_rate": 1.850794976964677e-06,
+ "loss": 0.3827,
+ "step": 784
+ },
+ {
+ "epoch": 3.700626959247649,
+ "grad_norm": 0.8918034434318542,
+ "learning_rate": 1.8443258628341026e-06,
+ "loss": 0.4144,
+ "step": 785
+ },
+ {
+ "epoch": 3.7053291536050157,
+ "grad_norm": 0.5443233847618103,
+ "learning_rate": 1.837861459911925e-06,
+ "loss": 0.4246,
+ "step": 786
+ },
+ {
+ "epoch": 3.7100313479623823,
+ "grad_norm": 0.6559080481529236,
+ "learning_rate": 1.8314018146467505e-06,
+ "loss": 0.4067,
+ "step": 787
+ },
+ {
+ "epoch": 3.714733542319749,
+ "grad_norm": 0.5071741342544556,
+ "learning_rate": 1.8249469734529995e-06,
+ "loss": 0.3888,
+ "step": 788
+ },
+ {
+ "epoch": 3.719435736677116,
+ "grad_norm": 0.5663676261901855,
+ "learning_rate": 1.818496982710572e-06,
+ "loss": 0.4256,
+ "step": 789
+ },
+ {
+ "epoch": 3.7241379310344827,
+ "grad_norm": 0.5477777719497681,
+ "learning_rate": 1.81205188876452e-06,
+ "loss": 0.423,
+ "step": 790
+ },
+ {
+ "epoch": 3.7288401253918497,
+ "grad_norm": 0.5709276795387268,
+ "learning_rate": 1.8056117379247078e-06,
+ "loss": 0.4265,
+ "step": 791
+ },
+ {
+ "epoch": 3.7335423197492164,
+ "grad_norm": 0.49602681398391724,
+ "learning_rate": 1.7991765764654813e-06,
+ "loss": 0.4141,
+ "step": 792
+ },
+ {
+ "epoch": 3.738244514106583,
+ "grad_norm": 0.5358700156211853,
+ "learning_rate": 1.7927464506253394e-06,
+ "loss": 0.4231,
+ "step": 793
+ },
+ {
+ "epoch": 3.7429467084639496,
+ "grad_norm": 1.1592613458633423,
+ "learning_rate": 1.7863214066065951e-06,
+ "loss": 0.3929,
+ "step": 794
+ },
+ {
+ "epoch": 3.7476489028213167,
+ "grad_norm": 0.5176786780357361,
+ "learning_rate": 1.779901490575051e-06,
+ "loss": 0.4201,
+ "step": 795
+ },
+ {
+ "epoch": 3.7523510971786833,
+ "grad_norm": 0.5303675532341003,
+ "learning_rate": 1.7734867486596596e-06,
+ "loss": 0.4201,
+ "step": 796
+ },
+ {
+ "epoch": 3.7570532915360504,
+ "grad_norm": 0.5633402466773987,
+ "learning_rate": 1.767077226952198e-06,
+ "loss": 0.4276,
+ "step": 797
+ },
+ {
+ "epoch": 3.761755485893417,
+ "grad_norm": 0.6016635894775391,
+ "learning_rate": 1.7606729715069349e-06,
+ "loss": 0.4143,
+ "step": 798
+ },
+ {
+ "epoch": 3.7664576802507836,
+ "grad_norm": 0.5202106237411499,
+ "learning_rate": 1.7542740283402981e-06,
+ "loss": 0.4195,
+ "step": 799
+ },
+ {
+ "epoch": 3.7711598746081503,
+ "grad_norm": 0.6279420852661133,
+ "learning_rate": 1.7478804434305466e-06,
+ "loss": 0.4001,
+ "step": 800
+ },
+ {
+ "epoch": 3.7758620689655173,
+ "grad_norm": 0.5253601670265198,
+ "learning_rate": 1.741492262717438e-06,
+ "loss": 0.4206,
+ "step": 801
+ },
+ {
+ "epoch": 3.780564263322884,
+ "grad_norm": 0.5218167901039124,
+ "learning_rate": 1.7351095321018974e-06,
+ "loss": 0.387,
+ "step": 802
+ },
+ {
+ "epoch": 3.785266457680251,
+ "grad_norm": 0.530846357345581,
+ "learning_rate": 1.7287322974456933e-06,
+ "loss": 0.3935,
+ "step": 803
+ },
+ {
+ "epoch": 3.7899686520376177,
+ "grad_norm": 0.5487862825393677,
+ "learning_rate": 1.7223606045711006e-06,
+ "loss": 0.4168,
+ "step": 804
+ },
+ {
+ "epoch": 3.7946708463949843,
+ "grad_norm": 0.5345083475112915,
+ "learning_rate": 1.7159944992605774e-06,
+ "loss": 0.4208,
+ "step": 805
+ },
+ {
+ "epoch": 3.799373040752351,
+ "grad_norm": 0.5425072312355042,
+ "learning_rate": 1.7096340272564318e-06,
+ "loss": 0.4088,
+ "step": 806
+ },
+ {
+ "epoch": 3.804075235109718,
+ "grad_norm": 0.5253011584281921,
+ "learning_rate": 1.7032792342604947e-06,
+ "loss": 0.3995,
+ "step": 807
+ },
+ {
+ "epoch": 3.8087774294670846,
+ "grad_norm": 0.7746017575263977,
+ "learning_rate": 1.6969301659337944e-06,
+ "loss": 0.4145,
+ "step": 808
+ },
+ {
+ "epoch": 3.8134796238244513,
+ "grad_norm": 0.7049569487571716,
+ "learning_rate": 1.6905868678962225e-06,
+ "loss": 0.4216,
+ "step": 809
+ },
+ {
+ "epoch": 3.8181818181818183,
+ "grad_norm": 0.602180540561676,
+ "learning_rate": 1.684249385726211e-06,
+ "loss": 0.4134,
+ "step": 810
+ },
+ {
+ "epoch": 3.822884012539185,
+ "grad_norm": 0.5291408896446228,
+ "learning_rate": 1.677917764960404e-06,
+ "loss": 0.402,
+ "step": 811
+ },
+ {
+ "epoch": 3.8275862068965516,
+ "grad_norm": 0.5529280304908752,
+ "learning_rate": 1.6715920510933277e-06,
+ "loss": 0.4322,
+ "step": 812
+ },
+ {
+ "epoch": 3.8322884012539182,
+ "grad_norm": 0.5989758968353271,
+ "learning_rate": 1.6652722895770676e-06,
+ "loss": 0.4275,
+ "step": 813
+ },
+ {
+ "epoch": 3.8369905956112853,
+ "grad_norm": 0.5088624358177185,
+ "learning_rate": 1.6589585258209383e-06,
+ "loss": 0.378,
+ "step": 814
+ },
+ {
+ "epoch": 3.841692789968652,
+ "grad_norm": 0.5167607665061951,
+ "learning_rate": 1.6526508051911588e-06,
+ "loss": 0.4221,
+ "step": 815
+ },
+ {
+ "epoch": 3.846394984326019,
+ "grad_norm": 0.5582865476608276,
+ "learning_rate": 1.6463491730105282e-06,
+ "loss": 0.4091,
+ "step": 816
+ },
+ {
+ "epoch": 3.8510971786833856,
+ "grad_norm": 0.5103083252906799,
+ "learning_rate": 1.6400536745580955e-06,
+ "loss": 0.3867,
+ "step": 817
+ },
+ {
+ "epoch": 3.8557993730407523,
+ "grad_norm": 0.528692901134491,
+ "learning_rate": 1.6337643550688408e-06,
+ "loss": 0.4178,
+ "step": 818
+ },
+ {
+ "epoch": 3.860501567398119,
+ "grad_norm": 0.5174258947372437,
+ "learning_rate": 1.627481259733343e-06,
+ "loss": 0.3989,
+ "step": 819
+ },
+ {
+ "epoch": 3.865203761755486,
+ "grad_norm": 0.492735892534256,
+ "learning_rate": 1.6212044336974598e-06,
+ "loss": 0.3935,
+ "step": 820
+ },
+ {
+ "epoch": 3.8699059561128526,
+ "grad_norm": 0.5810956954956055,
+ "learning_rate": 1.614933922062003e-06,
+ "loss": 0.4082,
+ "step": 821
+ },
+ {
+ "epoch": 3.8746081504702197,
+ "grad_norm": 0.5235511660575867,
+ "learning_rate": 1.6086697698824144e-06,
+ "loss": 0.4026,
+ "step": 822
+ },
+ {
+ "epoch": 3.8793103448275863,
+ "grad_norm": 0.5972744822502136,
+ "learning_rate": 1.6024120221684373e-06,
+ "loss": 0.4018,
+ "step": 823
+ },
+ {
+ "epoch": 3.884012539184953,
+ "grad_norm": 0.5685083270072937,
+ "learning_rate": 1.5961607238838022e-06,
+ "loss": 0.4077,
+ "step": 824
+ },
+ {
+ "epoch": 3.8887147335423196,
+ "grad_norm": 0.5427765250205994,
+ "learning_rate": 1.589915919945894e-06,
+ "loss": 0.4187,
+ "step": 825
+ },
+ {
+ "epoch": 3.8934169278996866,
+ "grad_norm": 0.6297295093536377,
+ "learning_rate": 1.5836776552254386e-06,
+ "loss": 0.4367,
+ "step": 826
+ },
+ {
+ "epoch": 3.8981191222570533,
+ "grad_norm": 0.6110124588012695,
+ "learning_rate": 1.5774459745461711e-06,
+ "loss": 0.4065,
+ "step": 827
+ },
+ {
+ "epoch": 3.9028213166144203,
+ "grad_norm": 0.4981592297554016,
+ "learning_rate": 1.5712209226845201e-06,
+ "loss": 0.3836,
+ "step": 828
+ },
+ {
+ "epoch": 3.907523510971787,
+ "grad_norm": 0.5722451210021973,
+ "learning_rate": 1.565002544369286e-06,
+ "loss": 0.4161,
+ "step": 829
+ },
+ {
+ "epoch": 3.9122257053291536,
+ "grad_norm": 0.6718733310699463,
+ "learning_rate": 1.5587908842813142e-06,
+ "loss": 0.4053,
+ "step": 830
+ },
+ {
+ "epoch": 3.91692789968652,
+ "grad_norm": 0.5070095658302307,
+ "learning_rate": 1.5525859870531823e-06,
+ "loss": 0.4198,
+ "step": 831
+ },
+ {
+ "epoch": 3.9216300940438873,
+ "grad_norm": 0.5303407311439514,
+ "learning_rate": 1.5463878972688707e-06,
+ "loss": 0.4089,
+ "step": 832
+ },
+ {
+ "epoch": 3.926332288401254,
+ "grad_norm": 0.5431908369064331,
+ "learning_rate": 1.5401966594634483e-06,
+ "loss": 0.4341,
+ "step": 833
+ },
+ {
+ "epoch": 3.9310344827586206,
+ "grad_norm": 0.549174427986145,
+ "learning_rate": 1.5340123181227495e-06,
+ "loss": 0.4237,
+ "step": 834
+ },
+ {
+ "epoch": 3.9357366771159876,
+ "grad_norm": 0.8902267217636108,
+ "learning_rate": 1.527834917683058e-06,
+ "loss": 0.3904,
+ "step": 835
+ },
+ {
+ "epoch": 3.9404388714733543,
+ "grad_norm": 0.5055849552154541,
+ "learning_rate": 1.5216645025307813e-06,
+ "loss": 0.4058,
+ "step": 836
+ },
+ {
+ "epoch": 3.945141065830721,
+ "grad_norm": 0.5319788455963135,
+ "learning_rate": 1.5155011170021399e-06,
+ "loss": 0.4153,
+ "step": 837
+ },
+ {
+ "epoch": 3.9498432601880875,
+ "grad_norm": 0.5441375374794006,
+ "learning_rate": 1.5093448053828402e-06,
+ "loss": 0.4231,
+ "step": 838
+ },
+ {
+ "epoch": 3.9545454545454546,
+ "grad_norm": 0.5940942764282227,
+ "learning_rate": 1.503195611907764e-06,
+ "loss": 0.4241,
+ "step": 839
+ },
+ {
+ "epoch": 3.959247648902821,
+ "grad_norm": 0.5203325748443604,
+ "learning_rate": 1.4970535807606453e-06,
+ "loss": 0.3842,
+ "step": 840
+ },
+ {
+ "epoch": 3.9639498432601883,
+ "grad_norm": 0.525404691696167,
+ "learning_rate": 1.4909187560737542e-06,
+ "loss": 0.3954,
+ "step": 841
+ },
+ {
+ "epoch": 3.968652037617555,
+ "grad_norm": 0.5999636054039001,
+ "learning_rate": 1.4847911819275829e-06,
+ "loss": 0.4061,
+ "step": 842
+ },
+ {
+ "epoch": 3.9733542319749215,
+ "grad_norm": 0.5253078937530518,
+ "learning_rate": 1.4786709023505224e-06,
+ "loss": 0.3969,
+ "step": 843
+ },
+ {
+ "epoch": 3.978056426332288,
+ "grad_norm": 0.535467803478241,
+ "learning_rate": 1.4725579613185549e-06,
+ "loss": 0.4241,
+ "step": 844
+ },
+ {
+ "epoch": 3.9827586206896552,
+ "grad_norm": 0.5458933711051941,
+ "learning_rate": 1.4664524027549291e-06,
+ "loss": 0.4102,
+ "step": 845
+ },
+ {
+ "epoch": 3.987460815047022,
+ "grad_norm": 0.515102207660675,
+ "learning_rate": 1.4603542705298493e-06,
+ "loss": 0.3957,
+ "step": 846
+ },
+ {
+ "epoch": 3.992163009404389,
+ "grad_norm": 0.572600245475769,
+ "learning_rate": 1.4542636084601624e-06,
+ "loss": 0.3686,
+ "step": 847
+ },
+ {
+ "epoch": 3.9968652037617556,
+ "grad_norm": 0.520165205001831,
+ "learning_rate": 1.4481804603090358e-06,
+ "loss": 0.4109,
+ "step": 848
+ }
+ ],
+ "logging_steps": 1,
+ "max_steps": 1272,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 6,
+ "save_steps": 212,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 5.461772594205793e+19,
+ "train_batch_size": 4,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/checkpoint-848/training_args.bin b/checkpoint-848/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7db90ca60ea3c300feb3b7d6e0cb54fc7cfb2060
--- /dev/null
+++ b/checkpoint-848/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:51f85402b182fc4b86518e0cb9ca9cbf150300e36000a38f53507b9a8663ad4b
+size 7928
diff --git a/checkpoint-848/zero_to_fp32.py b/checkpoint-848/zero_to_fp32.py
new file mode 100644
index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8
--- /dev/null
+++ b/checkpoint-848/zero_to_fp32.py
@@ -0,0 +1,604 @@
+#!/usr/bin/env python
+
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example: python zero_to_fp32.py . pytorch_model.bin
+
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+from collections import OrderedDict
+from dataclasses import dataclass
+
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+ FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+ FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+
+
+@dataclass
+class zero_model_state:
+ buffers: dict()
+ param_shapes: dict()
+ shared_params: list
+ ds_version: int
+ frozen_param_shapes: dict()
+ frozen_param_fragments: dict()
+
+
+debug = 0
+
+# load to cpu
+device = torch.device('cpu')
+
+
+def atoi(text):
+ return int(text) if text.isdigit() else text
+
+
+def natural_keys(text):
+ '''
+ alist.sort(key=natural_keys) sorts in human order
+ http://nedbatchelder.com/blog/200712/human_sorting.html
+ (See Toothy's implementation in the comments)
+ '''
+ return [atoi(c) for c in re.split(r'(\d+)', text)]
+
+
+def get_model_state_file(checkpoint_dir, zero_stage):
+ if not os.path.isdir(checkpoint_dir):
+ raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+
+ # there should be only one file
+ if zero_stage <= 2:
+ file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+ elif zero_stage == 3:
+ file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+
+ if not os.path.exists(file):
+ raise FileNotFoundError(f"can't find model states file at '{file}'")
+
+ return file
+
+
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+ # XXX: need to test that this simple glob rule works for multi-node setup too
+ ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
+
+ if len(ckpt_files) == 0:
+ raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+
+ return ckpt_files
+
+
+def get_optim_files(checkpoint_dir):
+ return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
+
+
+def get_model_state_files(checkpoint_dir):
+ return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
+
+
+def parse_model_states(files):
+ zero_model_states = []
+ for file in files:
+ state_dict = torch.load(file, map_location=device)
+
+ if BUFFER_NAMES not in state_dict:
+ raise ValueError(f"{file} is not a model state checkpoint")
+ buffer_names = state_dict[BUFFER_NAMES]
+ if debug:
+ print("Found buffers:", buffer_names)
+
+ # recover just the buffers while restoring them to fp32 if they were saved in fp16
+ buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+ param_shapes = state_dict[PARAM_SHAPES]
+
+ # collect parameters that are included in param_shapes
+ param_names = []
+ for s in param_shapes:
+ for name in s.keys():
+ param_names.append(name)
+
+ # update with frozen parameters
+ frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+ if frozen_param_shapes is not None:
+ if debug:
+ print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+ param_names += list(frozen_param_shapes.keys())
+
+ # handle shared params
+ shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+
+ ds_version = state_dict.get(DS_VERSION, None)
+
+ frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+
+ z_model_state = zero_model_state(buffers=buffers,
+ param_shapes=param_shapes,
+ shared_params=shared_params,
+ ds_version=ds_version,
+ frozen_param_shapes=frozen_param_shapes,
+ frozen_param_fragments=frozen_param_fragments)
+ zero_model_states.append(z_model_state)
+
+ return zero_model_states
+
+
+def parse_optim_states(files, ds_checkpoint_dir):
+
+ total_files = len(files)
+ state_dicts = []
+ for f in files:
+ state_dict = torch.load(f, map_location=device)
+ # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
+ # and also handle the case where it was already removed by another helper script
+ state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
+ state_dicts.append(state_dict)
+
+ if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]:
+ raise ValueError(f"{files[0]} is not a zero checkpoint")
+ zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
+ world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
+
+ # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+ # parameters can be different from data parallelism for non-expert parameters. So we can just
+ # use the max of the partition_count to get the dp world_size.
+
+ if type(world_size) is list:
+ world_size = max(world_size)
+
+ if world_size != total_files:
+ raise ValueError(
+ f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+ "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+ )
+
+ # the groups are named differently in each stage
+ if zero_stage <= 2:
+ fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+ elif zero_stage == 3:
+ fp32_groups_key = FP32_FLAT_GROUPS
+ else:
+ raise ValueError(f"unknown zero stage {zero_stage}")
+
+ if zero_stage <= 2:
+ fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
+ elif zero_stage == 3:
+ # if there is more than one param group, there will be multiple flattened tensors - one
+ # flattened tensor per group - for simplicity merge them into a single tensor
+ #
+ # XXX: could make the script more memory efficient for when there are multiple groups - it
+ # will require matching the sub-lists of param_shapes for each param group flattened tensor
+
+ fp32_flat_groups = [
+ torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts))
+ ]
+
+ return zero_stage, world_size, fp32_flat_groups
+
+
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+ """
+ Returns fp32 state_dict reconstructed from ds checkpoint
+
+ Args:
+ - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+
+ """
+ print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+
+ optim_files = get_optim_files(ds_checkpoint_dir)
+ zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
+ print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+
+ model_files = get_model_state_files(ds_checkpoint_dir)
+
+ zero_model_states = parse_model_states(model_files)
+ print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+
+ if zero_stage <= 2:
+ return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters)
+ elif zero_stage == 3:
+ return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters)
+
+
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+ if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+ return
+
+ frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+ frozen_param_fragments = zero_model_states[0].frozen_param_fragments
+
+ if debug:
+ num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+ print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+ wanted_params = len(frozen_param_shapes)
+ wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+ avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+ print(f'Frozen params: Have {avail_numel} numels to process.')
+ print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+ total_params = 0
+ total_numel = 0
+ for name, shape in frozen_param_shapes.items():
+ total_params += 1
+ unpartitioned_numel = shape.numel()
+ total_numel += unpartitioned_numel
+
+ state_dict[name] = frozen_param_fragments[name]
+
+ if debug:
+ print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+
+ print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _has_callable(obj, fn):
+ attr = getattr(obj, fn, None)
+ return callable(attr)
+
+
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+ param_shapes = zero_model_states[0].param_shapes
+
+ # Reconstruction protocol:
+ #
+ # XXX: document this
+
+ if debug:
+ for i in range(world_size):
+ for j in range(len(fp32_flat_groups[0])):
+ print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+
+ # XXX: memory usage doubles here (zero2)
+ num_param_groups = len(fp32_flat_groups[0])
+ merged_single_partition_of_fp32_groups = []
+ for i in range(num_param_groups):
+ merged_partitions = [sd[i] for sd in fp32_flat_groups]
+ full_single_fp32_vector = torch.cat(merged_partitions, 0)
+ merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+ avail_numel = sum(
+ [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+
+ if debug:
+ wanted_params = sum([len(shapes) for shapes in param_shapes])
+ wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+ # not asserting if there is a mismatch due to possible padding
+ print(f"Have {avail_numel} numels to process.")
+ print(f"Need {wanted_numel} numels in {wanted_params} params.")
+
+ # params
+ # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+ # out-of-core computing solution
+ total_numel = 0
+ total_params = 0
+ for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+ offset = 0
+ avail_numel = full_single_fp32_vector.numel()
+ for name, shape in shapes.items():
+
+ unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+ total_numel += unpartitioned_numel
+ total_params += 1
+
+ if debug:
+ print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+ state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+ offset += unpartitioned_numel
+
+ # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+ # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+ # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+ # live optimizer object, so we are checking that the numbers are within the right range
+ align_to = 2 * world_size
+
+ def zero2_align(x):
+ return align_to * math.ceil(x / align_to)
+
+ if debug:
+ print(f"original offset={offset}, avail_numel={avail_numel}")
+
+ offset = zero2_align(offset)
+ avail_numel = zero2_align(avail_numel)
+
+ if debug:
+ print(f"aligned offset={offset}, avail_numel={avail_numel}")
+
+ # Sanity check
+ if offset != avail_numel:
+ raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+ print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters):
+ state_dict = OrderedDict()
+
+ # buffers
+ buffers = zero_model_states[0].buffers
+ state_dict.update(buffers)
+ if debug:
+ print(f"added {len(buffers)} buffers")
+
+ if not exclude_frozen_parameters:
+ _zero2_merge_frozen_params(state_dict, zero_model_states)
+
+ _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+ # recover shared parameters
+ for pair in zero_model_states[0].shared_params:
+ if pair[1] in state_dict:
+ state_dict[pair[0]] = state_dict[pair[1]]
+
+ return state_dict
+
+
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+ remainder = unpartitioned_numel % world_size
+ padding_numel = (world_size - remainder) if remainder else 0
+ partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+ return partitioned_numel, padding_numel
+
+
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+ if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+ return
+
+ if debug:
+ for i in range(world_size):
+ num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+ print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+ frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+ wanted_params = len(frozen_param_shapes)
+ wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+ avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+ print(f'Frozen params: Have {avail_numel} numels to process.')
+ print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+ total_params = 0
+ total_numel = 0
+ for name, shape in zero_model_states[0].frozen_param_shapes.items():
+ total_params += 1
+ unpartitioned_numel = shape.numel()
+ total_numel += unpartitioned_numel
+
+ param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+ state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+
+ partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+ if debug:
+ print(
+ f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+ )
+
+ print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    """Reassemble full fp32 trainable parameters from per-rank ZeRO-3 flat shards.
+
+    Each rank contributes one flat 1-D group; every parameter is rebuilt by
+    concatenating its slice from all ranks, trimming alignment padding, and
+    restoring the recorded shape. Mutates ``state_dict`` in place and raises
+    ValueError if the consumed element count does not match what is available.
+    """
+    param_shapes = zero_model_states[0].param_shapes
+    avail_numel = fp32_flat_groups[0].numel() * world_size
+    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+    # param, re-consolidating each param, while dealing with padding if any
+
+    # merge list of dicts, preserving order
+    param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+
+    if debug:
+        for i in range(world_size):
+            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+
+    wanted_params = len(param_shapes)
+    wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+    # not asserting if there is a mismatch due to possible padding
+    avail_numel = fp32_flat_groups[0].numel() * world_size
+    print(f"Trainable params: Have {avail_numel} numels to process.")
+    print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    offset = 0
+    total_numel = 0
+    total_params = 0
+    for name, shape in param_shapes.items():
+
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        total_params += 1
+
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+        if debug:
+            print(
+                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+
+        # XXX: memory usage doubles here
+        # Pull this param's slice out of every rank's flat group at the current
+        # offset, concatenate along dim 0, drop trailing padding, restore shape.
+        state_dict[name] = torch.cat(
+            tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)),
+            0).narrow(0, 0, unpartitioned_numel).view(shape)
+        offset += partitioned_numel
+
+    # `offset` tracked a single rank's shard; scale by world_size to get the
+    # total number of flat elements consumed across all ranks.
+    offset *= world_size
+
+    # Sanity check
+    if offset != avail_numel:
+        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                               exclude_frozen_parameters):
+    """Build the consolidated fp32 state_dict for a ZeRO-3 checkpoint.
+
+    Merges buffers, (optionally) frozen params, and trainable params from the
+    per-rank shards, then re-links shared parameters recorded in the metadata.
+    """
+    state_dict = OrderedDict()
+
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+
+    if not exclude_frozen_parameters:
+        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+
+    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+    # recover shared parameters
+    # Each pair is (alias_name, source_name): the alias is pointed at the
+    # already-reconstructed source tensor so tied weights stay tied.
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+
+    return state_dict
+
+
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+    via a model hub.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+
+    Returns:
+        - pytorch ``state_dict``
+
+    Note: this approach may not work if your application doesn't have sufficient free CPU memory and
+    you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+    the checkpoint.
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        # do the training and checkpoint saving
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+        model = model.cpu() # move to cpu
+        model.load_state_dict(state_dict)
+        # submit to model hub or save the model to share with others
+
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+
+    """
+    if tag is None:
+        # Resolve the tag from the 'latest' marker file written alongside the
+        # checkpoint; its single line names the tag directory to load.
+        latest_path = os.path.join(checkpoint_dir, 'latest')
+        if os.path.isfile(latest_path):
+            with open(latest_path, 'r') as fd:
+                tag = fd.read().strip()
+        else:
+            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+
+    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+
+    if not os.path.isdir(ds_checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+
+    return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+
+
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
+    loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+    """
+
+    # NOTE: the full fp32 state_dict is materialized in host RAM before saving.
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters)
+    print(f"Saving fp32 state dict to {output_file}")
+    torch.save(state_dict, output_file)
+
+
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+    """
+    1. Put the provided model to cpu
+    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+    3. Load it into the provided model
+
+    Args:
+        - ``model``: the model object to update
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+
+    Returns:
+        - ``model``: modified model
+
+    Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+    conveniently placed for you in the checkpoint folder.
+
+    A typical usage might be ::
+
+        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+        # submit to model hub or save the model to share with others
+
+    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+    """
+    logger.info(f"Extracting fp32 weights")
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+
+    logger.info(f"Overwriting model with fp32 weights")
+    model = model.cpu()
+    # NOTE(review): strict=False — presumably to tolerate keys absent from the
+    # consolidated dict (e.g. excluded/frozen entries); confirm against caller.
+    model.load_state_dict(state_dict, strict=False)
+
+    return model
+
+
+if __name__ == "__main__":
+
+    # CLI entry point: offline conversion of a ZeRO checkpoint folder into a
+    # single consolidated fp32 pytorch_model.bin file.
+    parser = argparse.ArgumentParser()
+    parser.add_argument("checkpoint_dir",
+                        type=str,
+                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+    parser.add_argument(
+        "output_file",
+        type=str,
+        help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)")
+    parser.add_argument("-t",
+                        "--tag",
+                        type=str,
+                        default=None,
+                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+    args = parser.parse_args()
+
+    # Module-level flag consulted by the merge helpers' `if debug:` branches.
+    debug = args.debug
+
+    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+                                               args.output_file,
+                                               tag=args.tag,
+                                               exclude_frozen_parameters=args.exclude_frozen_parameters)
diff --git a/config.json b/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..9d764aead97a826f7aae3b9cfcfe2606e1d2eeec
--- /dev/null
+++ b/config.json
@@ -0,0 +1,52 @@
+{
+ "_attn_implementation_autoset": true,
+ "_name_or_path": "meta-llama/Llama-3.3-70B-Instruct",
+ "architectures": [
+ "LlamaForCausalLM"
+ ],
+ "attention_bias": false,
+ "attention_dropout": 0.0,
+ "bos_token_id": 128000,
+ "eos_token_id": 128009,
+ "head_dim": 128,
+ "hidden_act": "silu",
+ "hidden_size": 8192,
+ "initializer_range": 0.02,
+ "intermediate_size": 28672,
+ "max_position_embeddings": 131072,
+ "mlp_bias": false,
+ "model_type": "llama",
+ "num_attention_heads": 64,
+ "num_hidden_layers": 80,
+ "num_key_value_heads": 8,
+ "pretraining_tp": 1,
+ "quantization_config": {
+ "_load_in_4bit": true,
+ "_load_in_8bit": false,
+ "bnb_4bit_compute_dtype": "bfloat16",
+ "bnb_4bit_quant_storage": "bfloat16",
+ "bnb_4bit_quant_type": "nf4",
+ "bnb_4bit_use_double_quant": true,
+ "llm_int8_enable_fp32_cpu_offload": false,
+ "llm_int8_has_fp16_weight": false,
+ "llm_int8_skip_modules": null,
+ "llm_int8_threshold": 6.0,
+ "load_in_4bit": true,
+ "load_in_8bit": false,
+ "quant_method": "bitsandbytes"
+ },
+ "rms_norm_eps": 1e-05,
+ "rope_scaling": {
+ "factor": 8.0,
+ "high_freq_factor": 4.0,
+ "low_freq_factor": 1.0,
+ "original_max_position_embeddings": 8192,
+ "rope_type": "llama3"
+ },
+ "rope_theta": 500000.0,
+ "tie_word_embeddings": false,
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.49.0",
+ "use_cache": false,
+ "vocab_size": 128256
+}
diff --git a/special_tokens_map.json b/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..278b7f0f84be865c4687700ee7b3c63d89a51e18
--- /dev/null
+++ b/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+ "bos_token": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "<|eot_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/tokenizer.json b/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2
--- /dev/null
+++ b/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
+size 17209920
diff --git a/tokenizer_config.json b/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ca91a2ef55f4239a7af81d7c9abb05f53621a07b
--- /dev/null
+++ b/tokenizer_config.json
@@ -0,0 +1,2064 @@
+{
+ "added_tokens_decoder": {
+ "128000": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128001": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128002": {
+ "content": "<|reserved_special_token_0|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128003": {
+ "content": "<|reserved_special_token_1|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128004": {
+ "content": "<|finetune_right_pad_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128005": {
+ "content": "<|reserved_special_token_2|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128006": {
+ "content": "<|start_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128007": {
+ "content": "<|end_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128008": {
+ "content": "<|eom_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128009": {
+ "content": "<|eot_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128010": {
+ "content": "<|python_tag|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128011": {
+ "content": "<|reserved_special_token_3|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128012": {
+ "content": "<|reserved_special_token_4|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128013": {
+ "content": "<|reserved_special_token_5|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128014": {
+ "content": "<|reserved_special_token_6|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128015": {
+ "content": "<|reserved_special_token_7|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128016": {
+ "content": "<|reserved_special_token_8|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128017": {
+ "content": "<|reserved_special_token_9|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128018": {
+ "content": "<|reserved_special_token_10|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128019": {
+ "content": "<|reserved_special_token_11|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128020": {
+ "content": "<|reserved_special_token_12|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128021": {
+ "content": "<|reserved_special_token_13|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128022": {
+ "content": "<|reserved_special_token_14|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128023": {
+ "content": "<|reserved_special_token_15|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128024": {
+ "content": "<|reserved_special_token_16|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128025": {
+ "content": "<|reserved_special_token_17|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128026": {
+ "content": "<|reserved_special_token_18|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128027": {
+ "content": "<|reserved_special_token_19|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128028": {
+ "content": "<|reserved_special_token_20|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128029": {
+ "content": "<|reserved_special_token_21|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128030": {
+ "content": "<|reserved_special_token_22|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128031": {
+ "content": "<|reserved_special_token_23|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128032": {
+ "content": "<|reserved_special_token_24|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128033": {
+ "content": "<|reserved_special_token_25|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128034": {
+ "content": "<|reserved_special_token_26|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128035": {
+ "content": "<|reserved_special_token_27|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128036": {
+ "content": "<|reserved_special_token_28|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128037": {
+ "content": "<|reserved_special_token_29|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128038": {
+ "content": "<|reserved_special_token_30|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128039": {
+ "content": "<|reserved_special_token_31|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128040": {
+ "content": "<|reserved_special_token_32|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128041": {
+ "content": "<|reserved_special_token_33|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128042": {
+ "content": "<|reserved_special_token_34|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128043": {
+ "content": "<|reserved_special_token_35|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128044": {
+ "content": "<|reserved_special_token_36|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128045": {
+ "content": "<|reserved_special_token_37|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128046": {
+ "content": "<|reserved_special_token_38|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128047": {
+ "content": "<|reserved_special_token_39|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128048": {
+ "content": "<|reserved_special_token_40|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128049": {
+ "content": "<|reserved_special_token_41|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128050": {
+ "content": "<|reserved_special_token_42|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128051": {
+ "content": "<|reserved_special_token_43|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128052": {
+ "content": "<|reserved_special_token_44|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128053": {
+ "content": "<|reserved_special_token_45|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128054": {
+ "content": "<|reserved_special_token_46|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128055": {
+ "content": "<|reserved_special_token_47|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128056": {
+ "content": "<|reserved_special_token_48|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128057": {
+ "content": "<|reserved_special_token_49|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128058": {
+ "content": "<|reserved_special_token_50|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128059": {
+ "content": "<|reserved_special_token_51|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128060": {
+ "content": "<|reserved_special_token_52|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128061": {
+ "content": "<|reserved_special_token_53|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128062": {
+ "content": "<|reserved_special_token_54|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128063": {
+ "content": "<|reserved_special_token_55|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128064": {
+ "content": "<|reserved_special_token_56|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128065": {
+ "content": "<|reserved_special_token_57|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128066": {
+ "content": "<|reserved_special_token_58|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128067": {
+ "content": "<|reserved_special_token_59|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128068": {
+ "content": "<|reserved_special_token_60|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128069": {
+ "content": "<|reserved_special_token_61|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128070": {
+ "content": "<|reserved_special_token_62|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128071": {
+ "content": "<|reserved_special_token_63|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128072": {
+ "content": "<|reserved_special_token_64|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128073": {
+ "content": "<|reserved_special_token_65|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128074": {
+ "content": "<|reserved_special_token_66|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128075": {
+ "content": "<|reserved_special_token_67|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128076": {
+ "content": "<|reserved_special_token_68|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128077": {
+ "content": "<|reserved_special_token_69|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128078": {
+ "content": "<|reserved_special_token_70|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128079": {
+ "content": "<|reserved_special_token_71|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128080": {
+ "content": "<|reserved_special_token_72|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128081": {
+ "content": "<|reserved_special_token_73|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128082": {
+ "content": "<|reserved_special_token_74|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128083": {
+ "content": "<|reserved_special_token_75|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128084": {
+ "content": "<|reserved_special_token_76|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128085": {
+ "content": "<|reserved_special_token_77|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128086": {
+ "content": "<|reserved_special_token_78|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128087": {
+ "content": "<|reserved_special_token_79|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128088": {
+ "content": "<|reserved_special_token_80|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128089": {
+ "content": "<|reserved_special_token_81|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128090": {
+ "content": "<|reserved_special_token_82|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128091": {
+ "content": "<|reserved_special_token_83|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128092": {
+ "content": "<|reserved_special_token_84|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128093": {
+ "content": "<|reserved_special_token_85|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128094": {
+ "content": "<|reserved_special_token_86|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128095": {
+ "content": "<|reserved_special_token_87|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128096": {
+ "content": "<|reserved_special_token_88|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128097": {
+ "content": "<|reserved_special_token_89|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128098": {
+ "content": "<|reserved_special_token_90|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128099": {
+ "content": "<|reserved_special_token_91|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128100": {
+ "content": "<|reserved_special_token_92|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128101": {
+ "content": "<|reserved_special_token_93|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128102": {
+ "content": "<|reserved_special_token_94|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128103": {
+ "content": "<|reserved_special_token_95|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128104": {
+ "content": "<|reserved_special_token_96|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128105": {
+ "content": "<|reserved_special_token_97|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128106": {
+ "content": "<|reserved_special_token_98|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128107": {
+ "content": "<|reserved_special_token_99|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128108": {
+ "content": "<|reserved_special_token_100|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128109": {
+ "content": "<|reserved_special_token_101|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128110": {
+ "content": "<|reserved_special_token_102|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128111": {
+ "content": "<|reserved_special_token_103|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128112": {
+ "content": "<|reserved_special_token_104|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128113": {
+ "content": "<|reserved_special_token_105|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128114": {
+ "content": "<|reserved_special_token_106|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128115": {
+ "content": "<|reserved_special_token_107|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128116": {
+ "content": "<|reserved_special_token_108|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128117": {
+ "content": "<|reserved_special_token_109|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128118": {
+ "content": "<|reserved_special_token_110|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128119": {
+ "content": "<|reserved_special_token_111|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128120": {
+ "content": "<|reserved_special_token_112|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128121": {
+ "content": "<|reserved_special_token_113|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128122": {
+ "content": "<|reserved_special_token_114|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128123": {
+ "content": "<|reserved_special_token_115|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128124": {
+ "content": "<|reserved_special_token_116|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128125": {
+ "content": "<|reserved_special_token_117|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128126": {
+ "content": "<|reserved_special_token_118|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128127": {
+ "content": "<|reserved_special_token_119|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128128": {
+ "content": "<|reserved_special_token_120|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128129": {
+ "content": "<|reserved_special_token_121|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128130": {
+ "content": "<|reserved_special_token_122|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128131": {
+ "content": "<|reserved_special_token_123|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128132": {
+ "content": "<|reserved_special_token_124|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128133": {
+ "content": "<|reserved_special_token_125|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128134": {
+ "content": "<|reserved_special_token_126|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128135": {
+ "content": "<|reserved_special_token_127|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128136": {
+ "content": "<|reserved_special_token_128|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128137": {
+ "content": "<|reserved_special_token_129|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128138": {
+ "content": "<|reserved_special_token_130|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128139": {
+ "content": "<|reserved_special_token_131|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128140": {
+ "content": "<|reserved_special_token_132|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128141": {
+ "content": "<|reserved_special_token_133|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128142": {
+ "content": "<|reserved_special_token_134|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128143": {
+ "content": "<|reserved_special_token_135|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128144": {
+ "content": "<|reserved_special_token_136|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128145": {
+ "content": "<|reserved_special_token_137|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128146": {
+ "content": "<|reserved_special_token_138|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128147": {
+ "content": "<|reserved_special_token_139|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128148": {
+ "content": "<|reserved_special_token_140|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128149": {
+ "content": "<|reserved_special_token_141|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128150": {
+ "content": "<|reserved_special_token_142|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128151": {
+ "content": "<|reserved_special_token_143|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128152": {
+ "content": "<|reserved_special_token_144|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128153": {
+ "content": "<|reserved_special_token_145|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128154": {
+ "content": "<|reserved_special_token_146|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128155": {
+ "content": "<|reserved_special_token_147|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128156": {
+ "content": "<|reserved_special_token_148|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128157": {
+ "content": "<|reserved_special_token_149|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128158": {
+ "content": "<|reserved_special_token_150|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128159": {
+ "content": "<|reserved_special_token_151|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128160": {
+ "content": "<|reserved_special_token_152|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128161": {
+ "content": "<|reserved_special_token_153|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128162": {
+ "content": "<|reserved_special_token_154|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128163": {
+ "content": "<|reserved_special_token_155|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128164": {
+ "content": "<|reserved_special_token_156|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128165": {
+ "content": "<|reserved_special_token_157|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128166": {
+ "content": "<|reserved_special_token_158|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128167": {
+ "content": "<|reserved_special_token_159|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128168": {
+ "content": "<|reserved_special_token_160|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128169": {
+ "content": "<|reserved_special_token_161|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128170": {
+ "content": "<|reserved_special_token_162|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128171": {
+ "content": "<|reserved_special_token_163|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128172": {
+ "content": "<|reserved_special_token_164|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128173": {
+ "content": "<|reserved_special_token_165|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128174": {
+ "content": "<|reserved_special_token_166|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128175": {
+ "content": "<|reserved_special_token_167|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128176": {
+ "content": "<|reserved_special_token_168|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128177": {
+ "content": "<|reserved_special_token_169|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128178": {
+ "content": "<|reserved_special_token_170|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128179": {
+ "content": "<|reserved_special_token_171|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128180": {
+ "content": "<|reserved_special_token_172|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128181": {
+ "content": "<|reserved_special_token_173|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128182": {
+ "content": "<|reserved_special_token_174|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128183": {
+ "content": "<|reserved_special_token_175|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128184": {
+ "content": "<|reserved_special_token_176|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128185": {
+ "content": "<|reserved_special_token_177|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128186": {
+ "content": "<|reserved_special_token_178|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128187": {
+ "content": "<|reserved_special_token_179|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128188": {
+ "content": "<|reserved_special_token_180|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128189": {
+ "content": "<|reserved_special_token_181|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128190": {
+ "content": "<|reserved_special_token_182|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128191": {
+ "content": "<|reserved_special_token_183|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128192": {
+ "content": "<|reserved_special_token_184|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128193": {
+ "content": "<|reserved_special_token_185|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128194": {
+ "content": "<|reserved_special_token_186|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128195": {
+ "content": "<|reserved_special_token_187|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128196": {
+ "content": "<|reserved_special_token_188|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128197": {
+ "content": "<|reserved_special_token_189|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128198": {
+ "content": "<|reserved_special_token_190|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128199": {
+ "content": "<|reserved_special_token_191|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128200": {
+ "content": "<|reserved_special_token_192|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128201": {
+ "content": "<|reserved_special_token_193|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128202": {
+ "content": "<|reserved_special_token_194|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128203": {
+ "content": "<|reserved_special_token_195|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128204": {
+ "content": "<|reserved_special_token_196|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128205": {
+ "content": "<|reserved_special_token_197|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128206": {
+ "content": "<|reserved_special_token_198|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128207": {
+ "content": "<|reserved_special_token_199|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128208": {
+ "content": "<|reserved_special_token_200|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128209": {
+ "content": "<|reserved_special_token_201|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128210": {
+ "content": "<|reserved_special_token_202|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128211": {
+ "content": "<|reserved_special_token_203|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128212": {
+ "content": "<|reserved_special_token_204|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128213": {
+ "content": "<|reserved_special_token_205|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128214": {
+ "content": "<|reserved_special_token_206|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128215": {
+ "content": "<|reserved_special_token_207|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128216": {
+ "content": "<|reserved_special_token_208|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128217": {
+ "content": "<|reserved_special_token_209|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128218": {
+ "content": "<|reserved_special_token_210|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128219": {
+ "content": "<|reserved_special_token_211|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128220": {
+ "content": "<|reserved_special_token_212|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128221": {
+ "content": "<|reserved_special_token_213|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128222": {
+ "content": "<|reserved_special_token_214|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128223": {
+ "content": "<|reserved_special_token_215|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128224": {
+ "content": "<|reserved_special_token_216|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128225": {
+ "content": "<|reserved_special_token_217|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128226": {
+ "content": "<|reserved_special_token_218|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128227": {
+ "content": "<|reserved_special_token_219|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128228": {
+ "content": "<|reserved_special_token_220|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128229": {
+ "content": "<|reserved_special_token_221|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128230": {
+ "content": "<|reserved_special_token_222|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128231": {
+ "content": "<|reserved_special_token_223|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128232": {
+ "content": "<|reserved_special_token_224|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128233": {
+ "content": "<|reserved_special_token_225|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128234": {
+ "content": "<|reserved_special_token_226|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128235": {
+ "content": "<|reserved_special_token_227|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128236": {
+ "content": "<|reserved_special_token_228|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128237": {
+ "content": "<|reserved_special_token_229|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128238": {
+ "content": "<|reserved_special_token_230|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128239": {
+ "content": "<|reserved_special_token_231|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128240": {
+ "content": "<|reserved_special_token_232|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128241": {
+ "content": "<|reserved_special_token_233|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128242": {
+ "content": "<|reserved_special_token_234|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128243": {
+ "content": "<|reserved_special_token_235|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128244": {
+ "content": "<|reserved_special_token_236|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128245": {
+ "content": "<|reserved_special_token_237|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128246": {
+ "content": "<|reserved_special_token_238|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128247": {
+ "content": "<|reserved_special_token_239|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128248": {
+ "content": "<|reserved_special_token_240|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128249": {
+ "content": "<|reserved_special_token_241|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128250": {
+ "content": "<|reserved_special_token_242|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128251": {
+ "content": "<|reserved_special_token_243|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128252": {
+ "content": "<|reserved_special_token_244|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128253": {
+ "content": "<|reserved_special_token_245|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128254": {
+ "content": "<|reserved_special_token_246|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128255": {
+ "content": "<|reserved_special_token_247|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<|begin_of_text|>",
+ "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n",
+ "clean_up_tokenization_spaces": true,
+ "eos_token": "<|eot_id|>",
+ "extra_special_tokens": {},
+ "model_input_names": [
+ "input_ids",
+ "attention_mask"
+ ],
+ "model_max_length": 131072,
+ "pad_token": "<|end_of_text|>",
+ "tokenizer_class": "PreTrainedTokenizer"
+}