diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..bd6120e534915dba6dbb4e7599b9746b60e19a86 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,10 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +checkpoint-114/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-228/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-342/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-456/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-570/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-684/tokenizer.json filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d5c526ddfa14ccbc17e50d783309db19280521b8 --- /dev/null +++ b/README.md @@ -0,0 +1,146 @@ +--- +library_name: peft +license: other +base_model: nvidia/Llama-3_3-Nemotron-Super-49B-v1 +tags: +- generated_from_trainer +datasets: +- ugaoo/multimedqa_and_wrongonesllama +model-index: +- name: out/multimedqa_and_wrongonesllama + results: [] +--- + + + +[Built with Axolotl](https://github.com/axolotl-ai-cloud/axolotl) +
See axolotl config + +axolotl version: `0.8.0.dev0` +```yaml +base_model: nvidia/Llama-3_3-Nemotron-Super-49B-v1 +model_type: AutoModelForCausalLM +tokenizer_type: AutoTokenizer +trust_remote_code: true + +load_in_8bit: false +load_in_4bit: true +strict: false + +datasets: + - path: ugaoo/multimedqa_and_wrongonesllama + type: alpaca +val_set_size: 0 +output_dir: ./out/multimedqa_and_wrongonesllama + +sequence_len: 4000 +sample_packing: true +pad_to_sequence_len: true + +adapter: qlora +lora_r: 256 +lora_alpha: 512 +lora_dropout: 0.05 +lora_target_linear: true +lora_target_modules: + - q_proj + - k_proj + - v_proj + - o_proj + - up_proj + - down_proj + - gate_proj +lora_modules_to_save: + - embed_tokens + - lm_head + +wandb_project: cosmosearch +wandb_entity: +wandb_watch: +wandb_name: multimedqa_and_wrongonesllama_Super-49B +wandb_log_model: + +gradient_accumulation_steps: 3 +micro_batch_size: 4 +num_epochs: 6 +optimizer: adamw_torch +lr_scheduler: cosine +learning_rate: 5e-6 + +train_on_inputs: false +group_by_length: false +bf16: auto +fp16: false +tf32: false + +gradient_checkpointing: true +early_stopping_patience: +resume_from_checkpoint: +logging_steps: 1 +xformers_attention: +flash_attention: true + +warmup_steps: 100 +evals_per_epoch: 6 +eval_table_size: +saves_per_epoch: 1 +debug: +deepspeed: +weight_decay: 0.0 +fsdp: +fsdp_config: +save_total_limit: 6 +special_tokens: + pad_token: <|end_of_text|> + +``` + +

+ +# out/multimedqa_and_wrongonesllama + +This model is a fine-tuned version of [nvidia/Llama-3_3-Nemotron-Super-49B-v1](https://huggingface.co/nvidia/Llama-3_3-Nemotron-Super-49B-v1) on the ugaoo/multimedqa_and_wrongonesllama dataset. + +## Model description + +More information needed + +## Intended uses & limitations + +More information needed + +## Training and evaluation data + +More information needed + +## Training procedure + +### Training hyperparameters + +The following hyperparameters were used during training: +- learning_rate: 5e-06 +- train_batch_size: 4 +- eval_batch_size: 4 +- seed: 42 +- distributed_type: multi-GPU +- num_devices: 2 +- gradient_accumulation_steps: 3 +- total_train_batch_size: 24 +- total_eval_batch_size: 8 +- optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments +- lr_scheduler_type: cosine +- lr_scheduler_warmup_steps: 100 +- num_epochs: 6.0 + +### Training results + + + +### Framework versions + +- PEFT 0.15.0 +- Transformers 4.49.0 +- Pytorch 2.5.1+cu124 +- Datasets 3.4.1 +- Tokenizers 0.21.1 \ No newline at end of file diff --git a/adapter_config.json b/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..04e5237df60f7183856cc551f942e0ea492ed0be --- /dev/null +++ b/adapter_config.json @@ -0,0 +1,42 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "nvidia/Llama-3_3-Nemotron-Super-49B-v1", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 512, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [ + "embed_tokens", + "lm_head" + ], + "peft_type": "LORA", + "r": 256, 
+ "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "k_proj", + "q_proj", + "v_proj", + "down_proj", + "gate_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/adapter_model.safetensors b/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d071e8337a127c8780a346e6e69c4e2195786154 --- /dev/null +++ b/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c08cabaa331365104eda0f955b3bcca40f58f5ba2408e03aedf9cc235c104191 +size 9016826528 diff --git a/checkpoint-114/README.md b/checkpoint-114/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f4a3934800eeb082a0cb833d7b6af4f68eed3615 --- /dev/null +++ b/checkpoint-114/README.md @@ -0,0 +1,202 @@ +--- +base_model: nvidia/Llama-3_3-Nemotron-Super-49B-v1 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and 
limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.0 \ No newline at end of file diff --git a/checkpoint-114/adapter_config.json b/checkpoint-114/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..04e5237df60f7183856cc551f942e0ea492ed0be --- /dev/null +++ b/checkpoint-114/adapter_config.json @@ -0,0 +1,42 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "nvidia/Llama-3_3-Nemotron-Super-49B-v1", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 512, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [ + "embed_tokens", + "lm_head" + ], + "peft_type": "LORA", + "r": 256, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "k_proj", + "q_proj", + "v_proj", + "down_proj", + "gate_proj", + "up_proj" + ], + "task_type": 
"CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-114/adapter_model.safetensors b/checkpoint-114/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f4239b78cce51457a023d3b245d6dd89bd6bbe36 --- /dev/null +++ b/checkpoint-114/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c9b6e9543d7c2d41d3306ac6f0fe4cda7267eece06c8587fec3a68b8ba04243 +size 9016826528 diff --git a/checkpoint-114/global_step114/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-114/global_step114/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6a84d1cc265afdbab6917598c97c7b483552c253 --- /dev/null +++ b/checkpoint-114/global_step114/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eca3ffba8bfb5906d22a15bd9ccc52f42b5339056f7a5836afbe7c8a33cbfbb5 +size 27050164444 diff --git a/checkpoint-114/global_step114/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-114/global_step114/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d7c5a252a0149e81136e00af57a943b5571ecc98 --- /dev/null +++ b/checkpoint-114/global_step114/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8de04aa8407a0004079a71b924943aa7468b24af275b648a69a53fe0c20db5e +size 27050169884 diff --git a/checkpoint-114/global_step114/mp_rank_00_model_states.pt b/checkpoint-114/global_step114/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7c9d96534454ef44d808e002132b2bc109f507cd --- /dev/null +++ b/checkpoint-114/global_step114/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:3e0478eba317b4d5b9acff1bf96c6c89ae28a8b2e1080f0575c3b24bd186b155 +size 9776788601 diff --git a/checkpoint-114/latest b/checkpoint-114/latest new file mode 100644 index 0000000000000000000000000000000000000000..aad80f76777fd4d23b0b81026f4601524335cbe1 --- /dev/null +++ b/checkpoint-114/latest @@ -0,0 +1 @@ +global_step114 \ No newline at end of file diff --git a/checkpoint-114/rng_state_0.pth b/checkpoint-114/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..3115ef5b3f240303888fd17b7517182de213d964 --- /dev/null +++ b/checkpoint-114/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:acededc55cf300dac4729a8ab7c731573a49bfe522164173f4aa200189894bf7 +size 14512 diff --git a/checkpoint-114/rng_state_1.pth b/checkpoint-114/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..8677a911d6d783cf6a6dc5b8b13f6dd17eca4720 --- /dev/null +++ b/checkpoint-114/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:174dbd9d3bdd2e47a45b6b645ec401c6d6b33e4bf885128debfda1d5649a747a +size 14512 diff --git a/checkpoint-114/scheduler.pt b/checkpoint-114/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..40c17d3863167c8f9a6afd933a45e93fda7d96e4 --- /dev/null +++ b/checkpoint-114/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ad006ca6fd06276c4f0d747b779fbbcfdff6edce744bcfd757e846b0536c240 +size 1064 diff --git a/checkpoint-114/special_tokens_map.json b/checkpoint-114/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..278b7f0f84be865c4687700ee7b3c63d89a51e18 --- /dev/null +++ b/checkpoint-114/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-114/tokenizer.json b/checkpoint-114/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-114/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-114/tokenizer_config.json b/checkpoint-114/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..edd01b980c1db496ea102a51c972ee8f5d1a2c74 --- /dev/null +++ b/checkpoint-114/tokenizer_config.json @@ -0,0 +1,2064 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": 
"<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": 
false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": 
"<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": 
"<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}{%- if messages[0]['role'] == 'system' %}{%- set system_message = messages[0]['content']|trim %}{%- set messages = messages[1:] %}{%- else %}{%- set system_message = \"\" %}{%- endif %}{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}{{- system_message }}{{- \"<|eot_id|>\" }}{%- for message in messages %}{%- if message['role'] == 'assistant' and '' in message['content'] %}{%- set content = message['content'].split('')[-1].lstrip() %}{%- else %}{%- set content = message['content'] %}{%- endif %}{{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n' + content | trim + '<|eot_id|>' }}{%- endfor %}{%- if add_generation_prompt %}{{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}{%- endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|end_of_text|>", + "tokenizer_class": "PreTrainedTokenizer" +} diff --git a/checkpoint-114/trainer_state.json b/checkpoint-114/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c75ec46ce1ce29382194d526c7ccab18b22e1bc7 --- /dev/null +++ b/checkpoint-114/trainer_state.json @@ -0,0 +1,831 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 114, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008771929824561403, + "grad_norm": 39.56407165527344, + "learning_rate": 5.0000000000000004e-08, + "loss": 5.1375, + "step": 1 + }, + { + "epoch": 0.017543859649122806, + "grad_norm": 40.30452346801758, + "learning_rate": 1.0000000000000001e-07, + "loss": 5.1185, + "step": 2 + }, + { + 
"epoch": 0.02631578947368421, + "grad_norm": 40.062313079833984, + "learning_rate": 1.5000000000000002e-07, + "loss": 5.0762, + "step": 3 + }, + { + "epoch": 0.03508771929824561, + "grad_norm": 39.17148208618164, + "learning_rate": 2.0000000000000002e-07, + "loss": 5.016, + "step": 4 + }, + { + "epoch": 0.043859649122807015, + "grad_norm": 40.67367172241211, + "learning_rate": 2.5000000000000004e-07, + "loss": 5.0428, + "step": 5 + }, + { + "epoch": 0.05263157894736842, + "grad_norm": 38.18095016479492, + "learning_rate": 3.0000000000000004e-07, + "loss": 5.2025, + "step": 6 + }, + { + "epoch": 0.06140350877192982, + "grad_norm": 39.12940979003906, + "learning_rate": 3.5000000000000004e-07, + "loss": 4.9896, + "step": 7 + }, + { + "epoch": 0.07017543859649122, + "grad_norm": 38.84568405151367, + "learning_rate": 4.0000000000000003e-07, + "loss": 5.1078, + "step": 8 + }, + { + "epoch": 0.07894736842105263, + "grad_norm": 39.38333511352539, + "learning_rate": 4.5000000000000003e-07, + "loss": 5.0808, + "step": 9 + }, + { + "epoch": 0.08771929824561403, + "grad_norm": 39.427650451660156, + "learning_rate": 5.000000000000001e-07, + "loss": 5.0534, + "step": 10 + }, + { + "epoch": 0.09649122807017543, + "grad_norm": 39.29513168334961, + "learning_rate": 5.5e-07, + "loss": 5.058, + "step": 11 + }, + { + "epoch": 0.10526315789473684, + "grad_norm": 39.641231536865234, + "learning_rate": 6.000000000000001e-07, + "loss": 5.0317, + "step": 12 + }, + { + "epoch": 0.11403508771929824, + "grad_norm": 37.91259765625, + "learning_rate": 6.5e-07, + "loss": 4.912, + "step": 13 + }, + { + "epoch": 0.12280701754385964, + "grad_norm": 38.203548431396484, + "learning_rate": 7.000000000000001e-07, + "loss": 4.9705, + "step": 14 + }, + { + "epoch": 0.13157894736842105, + "grad_norm": 39.15998840332031, + "learning_rate": 7.5e-07, + "loss": 4.6962, + "step": 15 + }, + { + "epoch": 0.14035087719298245, + "grad_norm": 37.754669189453125, + "learning_rate": 8.000000000000001e-07, + "loss": 
4.6262, + "step": 16 + }, + { + "epoch": 0.14912280701754385, + "grad_norm": 35.871490478515625, + "learning_rate": 8.500000000000001e-07, + "loss": 4.5422, + "step": 17 + }, + { + "epoch": 0.15789473684210525, + "grad_norm": 36.16888427734375, + "learning_rate": 9.000000000000001e-07, + "loss": 4.664, + "step": 18 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 33.520118713378906, + "learning_rate": 9.500000000000001e-07, + "loss": 4.4697, + "step": 19 + }, + { + "epoch": 0.17543859649122806, + "grad_norm": 30.896282196044922, + "learning_rate": 1.0000000000000002e-06, + "loss": 4.3568, + "step": 20 + }, + { + "epoch": 0.18421052631578946, + "grad_norm": 29.944643020629883, + "learning_rate": 1.0500000000000001e-06, + "loss": 4.2269, + "step": 21 + }, + { + "epoch": 0.19298245614035087, + "grad_norm": 25.224485397338867, + "learning_rate": 1.1e-06, + "loss": 4.1272, + "step": 22 + }, + { + "epoch": 0.20175438596491227, + "grad_norm": 24.410480499267578, + "learning_rate": 1.1500000000000002e-06, + "loss": 4.0585, + "step": 23 + }, + { + "epoch": 0.21052631578947367, + "grad_norm": 21.480648040771484, + "learning_rate": 1.2000000000000002e-06, + "loss": 3.9472, + "step": 24 + }, + { + "epoch": 0.21929824561403508, + "grad_norm": 20.61946678161621, + "learning_rate": 1.25e-06, + "loss": 3.8879, + "step": 25 + }, + { + "epoch": 0.22807017543859648, + "grad_norm": 19.578271865844727, + "learning_rate": 1.3e-06, + "loss": 3.6783, + "step": 26 + }, + { + "epoch": 0.23684210526315788, + "grad_norm": 17.418983459472656, + "learning_rate": 1.3500000000000002e-06, + "loss": 3.6826, + "step": 27 + }, + { + "epoch": 0.24561403508771928, + "grad_norm": 18.160301208496094, + "learning_rate": 1.4000000000000001e-06, + "loss": 3.478, + "step": 28 + }, + { + "epoch": 0.2543859649122807, + "grad_norm": 17.573204040527344, + "learning_rate": 1.45e-06, + "loss": 3.459, + "step": 29 + }, + { + "epoch": 0.2631578947368421, + "grad_norm": 17.1265869140625, + "learning_rate": 
1.5e-06, + "loss": 3.3999, + "step": 30 + }, + { + "epoch": 0.2719298245614035, + "grad_norm": 15.527145385742188, + "learning_rate": 1.5500000000000002e-06, + "loss": 3.2817, + "step": 31 + }, + { + "epoch": 0.2807017543859649, + "grad_norm": 14.773847579956055, + "learning_rate": 1.6000000000000001e-06, + "loss": 3.234, + "step": 32 + }, + { + "epoch": 0.2894736842105263, + "grad_norm": 12.039301872253418, + "learning_rate": 1.6500000000000003e-06, + "loss": 3.132, + "step": 33 + }, + { + "epoch": 0.2982456140350877, + "grad_norm": 9.217979431152344, + "learning_rate": 1.7000000000000002e-06, + "loss": 3.0548, + "step": 34 + }, + { + "epoch": 0.30701754385964913, + "grad_norm": 7.575639724731445, + "learning_rate": 1.75e-06, + "loss": 2.9529, + "step": 35 + }, + { + "epoch": 0.3157894736842105, + "grad_norm": 7.496004104614258, + "learning_rate": 1.8000000000000001e-06, + "loss": 2.8967, + "step": 36 + }, + { + "epoch": 0.32456140350877194, + "grad_norm": 7.45414924621582, + "learning_rate": 1.85e-06, + "loss": 2.8837, + "step": 37 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 8.555658340454102, + "learning_rate": 1.9000000000000002e-06, + "loss": 2.7473, + "step": 38 + }, + { + "epoch": 0.34210526315789475, + "grad_norm": 10.03805160522461, + "learning_rate": 1.9500000000000004e-06, + "loss": 2.7355, + "step": 39 + }, + { + "epoch": 0.3508771929824561, + "grad_norm": 9.30649471282959, + "learning_rate": 2.0000000000000003e-06, + "loss": 2.6587, + "step": 40 + }, + { + "epoch": 0.35964912280701755, + "grad_norm": 8.510339736938477, + "learning_rate": 2.05e-06, + "loss": 2.5977, + "step": 41 + }, + { + "epoch": 0.3684210526315789, + "grad_norm": 4.709080696105957, + "learning_rate": 2.1000000000000002e-06, + "loss": 2.6286, + "step": 42 + }, + { + "epoch": 0.37719298245614036, + "grad_norm": 5.128961086273193, + "learning_rate": 2.15e-06, + "loss": 2.4558, + "step": 43 + }, + { + "epoch": 0.38596491228070173, + "grad_norm": 5.190136432647705, + 
"learning_rate": 2.2e-06, + "loss": 2.4432, + "step": 44 + }, + { + "epoch": 0.39473684210526316, + "grad_norm": 4.893551349639893, + "learning_rate": 2.25e-06, + "loss": 2.4939, + "step": 45 + }, + { + "epoch": 0.40350877192982454, + "grad_norm": 5.2434983253479, + "learning_rate": 2.3000000000000004e-06, + "loss": 2.3381, + "step": 46 + }, + { + "epoch": 0.41228070175438597, + "grad_norm": 5.122412204742432, + "learning_rate": 2.35e-06, + "loss": 2.313, + "step": 47 + }, + { + "epoch": 0.42105263157894735, + "grad_norm": 4.577274799346924, + "learning_rate": 2.4000000000000003e-06, + "loss": 2.2236, + "step": 48 + }, + { + "epoch": 0.4298245614035088, + "grad_norm": 4.722769737243652, + "learning_rate": 2.4500000000000003e-06, + "loss": 2.1987, + "step": 49 + }, + { + "epoch": 0.43859649122807015, + "grad_norm": 5.059235095977783, + "learning_rate": 2.5e-06, + "loss": 2.1415, + "step": 50 + }, + { + "epoch": 0.4473684210526316, + "grad_norm": 4.454439640045166, + "learning_rate": 2.55e-06, + "loss": 2.0466, + "step": 51 + }, + { + "epoch": 0.45614035087719296, + "grad_norm": 4.94586706161499, + "learning_rate": 2.6e-06, + "loss": 1.8762, + "step": 52 + }, + { + "epoch": 0.4649122807017544, + "grad_norm": 4.704402446746826, + "learning_rate": 2.6500000000000005e-06, + "loss": 1.8012, + "step": 53 + }, + { + "epoch": 0.47368421052631576, + "grad_norm": 6.125903129577637, + "learning_rate": 2.7000000000000004e-06, + "loss": 1.7669, + "step": 54 + }, + { + "epoch": 0.4824561403508772, + "grad_norm": 4.5356059074401855, + "learning_rate": 2.7500000000000004e-06, + "loss": 1.6607, + "step": 55 + }, + { + "epoch": 0.49122807017543857, + "grad_norm": 6.56803035736084, + "learning_rate": 2.8000000000000003e-06, + "loss": 1.6291, + "step": 56 + }, + { + "epoch": 0.5, + "grad_norm": 4.910050392150879, + "learning_rate": 2.85e-06, + "loss": 1.5545, + "step": 57 + }, + { + "epoch": 0.5087719298245614, + "grad_norm": 8.733433723449707, + "learning_rate": 2.9e-06, + "loss": 
1.4206, + "step": 58 + }, + { + "epoch": 0.5175438596491229, + "grad_norm": 8.582486152648926, + "learning_rate": 2.95e-06, + "loss": 1.3912, + "step": 59 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 13.710689544677734, + "learning_rate": 3e-06, + "loss": 1.3297, + "step": 60 + }, + { + "epoch": 0.5350877192982456, + "grad_norm": 23.400312423706055, + "learning_rate": 3.05e-06, + "loss": 1.296, + "step": 61 + }, + { + "epoch": 0.543859649122807, + "grad_norm": 5.678805351257324, + "learning_rate": 3.1000000000000004e-06, + "loss": 1.2259, + "step": 62 + }, + { + "epoch": 0.5526315789473685, + "grad_norm": 14.700899124145508, + "learning_rate": 3.1500000000000003e-06, + "loss": 1.1087, + "step": 63 + }, + { + "epoch": 0.5614035087719298, + "grad_norm": 19.38919448852539, + "learning_rate": 3.2000000000000003e-06, + "loss": 1.1805, + "step": 64 + }, + { + "epoch": 0.5701754385964912, + "grad_norm": 8.460039138793945, + "learning_rate": 3.2500000000000002e-06, + "loss": 1.0963, + "step": 65 + }, + { + "epoch": 0.5789473684210527, + "grad_norm": 13.371014595031738, + "learning_rate": 3.3000000000000006e-06, + "loss": 1.0627, + "step": 66 + }, + { + "epoch": 0.5877192982456141, + "grad_norm": 22.380569458007812, + "learning_rate": 3.3500000000000005e-06, + "loss": 1.0869, + "step": 67 + }, + { + "epoch": 0.5964912280701754, + "grad_norm": 5.780513286590576, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.9991, + "step": 68 + }, + { + "epoch": 0.6052631578947368, + "grad_norm": 19.850841522216797, + "learning_rate": 3.45e-06, + "loss": 0.9683, + "step": 69 + }, + { + "epoch": 0.6140350877192983, + "grad_norm": 17.160703659057617, + "learning_rate": 3.5e-06, + "loss": 0.845, + "step": 70 + }, + { + "epoch": 0.6228070175438597, + "grad_norm": 14.264311790466309, + "learning_rate": 3.5500000000000003e-06, + "loss": 0.8059, + "step": 71 + }, + { + "epoch": 0.631578947368421, + "grad_norm": 26.39459991455078, + "learning_rate": 3.6000000000000003e-06, + 
"loss": 0.85, + "step": 72 + }, + { + "epoch": 0.6403508771929824, + "grad_norm": 51.10348892211914, + "learning_rate": 3.65e-06, + "loss": 0.9755, + "step": 73 + }, + { + "epoch": 0.6491228070175439, + "grad_norm": 28.795856475830078, + "learning_rate": 3.7e-06, + "loss": 0.8966, + "step": 74 + }, + { + "epoch": 0.6578947368421053, + "grad_norm": 4.6617937088012695, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.7716, + "step": 75 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 15.729666709899902, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.7578, + "step": 76 + }, + { + "epoch": 0.6754385964912281, + "grad_norm": 7.109970569610596, + "learning_rate": 3.85e-06, + "loss": 0.7055, + "step": 77 + }, + { + "epoch": 0.6842105263157895, + "grad_norm": 20.84659194946289, + "learning_rate": 3.900000000000001e-06, + "loss": 0.7458, + "step": 78 + }, + { + "epoch": 0.6929824561403509, + "grad_norm": 21.601303100585938, + "learning_rate": 3.95e-06, + "loss": 0.6879, + "step": 79 + }, + { + "epoch": 0.7017543859649122, + "grad_norm": 3.6914751529693604, + "learning_rate": 4.000000000000001e-06, + "loss": 0.6179, + "step": 80 + }, + { + "epoch": 0.7105263157894737, + "grad_norm": 16.539325714111328, + "learning_rate": 4.05e-06, + "loss": 0.5716, + "step": 81 + }, + { + "epoch": 0.7192982456140351, + "grad_norm": 13.931925773620605, + "learning_rate": 4.1e-06, + "loss": 0.558, + "step": 82 + }, + { + "epoch": 0.7280701754385965, + "grad_norm": 10.52951717376709, + "learning_rate": 4.15e-06, + "loss": 0.6018, + "step": 83 + }, + { + "epoch": 0.7368421052631579, + "grad_norm": 17.337060928344727, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.5501, + "step": 84 + }, + { + "epoch": 0.7456140350877193, + "grad_norm": 13.500468254089355, + "learning_rate": 4.25e-06, + "loss": 0.5214, + "step": 85 + }, + { + "epoch": 0.7543859649122807, + "grad_norm": 10.290645599365234, + "learning_rate": 4.3e-06, + "loss": 0.4996, + "step": 86 + }, + { + "epoch": 
0.7631578947368421, + "grad_norm": 9.757556915283203, + "learning_rate": 4.350000000000001e-06, + "loss": 0.498, + "step": 87 + }, + { + "epoch": 0.7719298245614035, + "grad_norm": 9.325140953063965, + "learning_rate": 4.4e-06, + "loss": 0.4721, + "step": 88 + }, + { + "epoch": 0.7807017543859649, + "grad_norm": 2.9322128295898438, + "learning_rate": 4.450000000000001e-06, + "loss": 0.4528, + "step": 89 + }, + { + "epoch": 0.7894736842105263, + "grad_norm": 10.484073638916016, + "learning_rate": 4.5e-06, + "loss": 0.445, + "step": 90 + }, + { + "epoch": 0.7982456140350878, + "grad_norm": 32.7827262878418, + "learning_rate": 4.5500000000000005e-06, + "loss": 0.5105, + "step": 91 + }, + { + "epoch": 0.8070175438596491, + "grad_norm": 2.8477306365966797, + "learning_rate": 4.600000000000001e-06, + "loss": 0.4117, + "step": 92 + }, + { + "epoch": 0.8157894736842105, + "grad_norm": 2.7680225372314453, + "learning_rate": 4.65e-06, + "loss": 0.3653, + "step": 93 + }, + { + "epoch": 0.8245614035087719, + "grad_norm": 2.6512742042541504, + "learning_rate": 4.7e-06, + "loss": 0.3878, + "step": 94 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 6.453914165496826, + "learning_rate": 4.75e-06, + "loss": 0.3611, + "step": 95 + }, + { + "epoch": 0.8421052631578947, + "grad_norm": 3.4594080448150635, + "learning_rate": 4.800000000000001e-06, + "loss": 0.3817, + "step": 96 + }, + { + "epoch": 0.8508771929824561, + "grad_norm": 3.6144917011260986, + "learning_rate": 4.85e-06, + "loss": 0.3618, + "step": 97 + }, + { + "epoch": 0.8596491228070176, + "grad_norm": 5.349407196044922, + "learning_rate": 4.9000000000000005e-06, + "loss": 0.3218, + "step": 98 + }, + { + "epoch": 0.868421052631579, + "grad_norm": 13.671236991882324, + "learning_rate": 4.95e-06, + "loss": 0.3329, + "step": 99 + }, + { + "epoch": 0.8771929824561403, + "grad_norm": 5.84046745300293, + "learning_rate": 5e-06, + "loss": 0.2967, + "step": 100 + }, + { + "epoch": 0.8859649122807017, + "grad_norm": 
14.005338668823242, + "learning_rate": 4.999963827125897e-06, + "loss": 0.303, + "step": 101 + }, + { + "epoch": 0.8947368421052632, + "grad_norm": 9.18114185333252, + "learning_rate": 4.999855309550366e-06, + "loss": 0.2762, + "step": 102 + }, + { + "epoch": 0.9035087719298246, + "grad_norm": 3.0800487995147705, + "learning_rate": 4.999674450413725e-06, + "loss": 0.2628, + "step": 103 + }, + { + "epoch": 0.9122807017543859, + "grad_norm": 82.03578186035156, + "learning_rate": 4.999421254949728e-06, + "loss": 0.4065, + "step": 104 + }, + { + "epoch": 0.9210526315789473, + "grad_norm": 77.66315460205078, + "learning_rate": 4.99909573048542e-06, + "loss": 0.4307, + "step": 105 + }, + { + "epoch": 0.9298245614035088, + "grad_norm": 18.28767967224121, + "learning_rate": 4.998697886440927e-06, + "loss": 0.2571, + "step": 106 + }, + { + "epoch": 0.9385964912280702, + "grad_norm": 5.960445880889893, + "learning_rate": 4.998227734329177e-06, + "loss": 0.2847, + "step": 107 + }, + { + "epoch": 0.9473684210526315, + "grad_norm": 5.437699794769287, + "learning_rate": 4.9976852877555755e-06, + "loss": 0.2728, + "step": 108 + }, + { + "epoch": 0.956140350877193, + "grad_norm": 3.379631280899048, + "learning_rate": 4.997070562417602e-06, + "loss": 0.2467, + "step": 109 + }, + { + "epoch": 0.9649122807017544, + "grad_norm": 3.1625075340270996, + "learning_rate": 4.996383576104362e-06, + "loss": 0.2273, + "step": 110 + }, + { + "epoch": 0.9736842105263158, + "grad_norm": 15.588600158691406, + "learning_rate": 4.995624348696071e-06, + "loss": 0.2486, + "step": 111 + }, + { + "epoch": 0.9824561403508771, + "grad_norm": 2.631044387817383, + "learning_rate": 4.9947929021634815e-06, + "loss": 0.1964, + "step": 112 + }, + { + "epoch": 0.9912280701754386, + "grad_norm": 4.706504821777344, + "learning_rate": 4.993889260567239e-06, + "loss": 0.1901, + "step": 113 + }, + { + "epoch": 1.0, + "grad_norm": 10.368465423583984, + "learning_rate": 4.9929134500571954e-06, + "loss": 0.1996, + 
"step": 114 + } + ], + "logging_steps": 1, + "max_steps": 684, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 114, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.45999007414852e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-114/training_args.bin b/checkpoint-114/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..38c27bdabb0e0e68242bce9d9302628a34f6e7cf --- /dev/null +++ b/checkpoint-114/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7cb0553c2c3dd5a010aed55eae3afd8bd7f096b43ba03d25af54dc26191426ae +size 7992 diff --git a/checkpoint-114/zero_to_fp32.py b/checkpoint-114/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/checkpoint-114/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in 
state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to 
the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel 
{unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + 
print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + 
partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the 
partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + 
state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. 
you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. 
Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/checkpoint-228/README.md b/checkpoint-228/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f4a3934800eeb082a0cb833d7b6af4f68eed3615 --- /dev/null +++ b/checkpoint-228/README.md @@ -0,0 +1,202 @@ +--- +base_model: nvidia/Llama-3_3-Nemotron-Super-49B-v1 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.0 \ No newline at end of file diff --git a/checkpoint-228/adapter_config.json b/checkpoint-228/adapter_config.json new file mode 100644 index 
0000000000000000000000000000000000000000..04e5237df60f7183856cc551f942e0ea492ed0be --- /dev/null +++ b/checkpoint-228/adapter_config.json @@ -0,0 +1,42 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "nvidia/Llama-3_3-Nemotron-Super-49B-v1", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 512, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [ + "embed_tokens", + "lm_head" + ], + "peft_type": "LORA", + "r": 256, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "k_proj", + "q_proj", + "v_proj", + "down_proj", + "gate_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-228/adapter_model.safetensors b/checkpoint-228/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a68b30f953577754becff0c56a3018c6e48f3d1b --- /dev/null +++ b/checkpoint-228/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7595d2ffb7b408a9a2b9933fb1eb962a9e37c2d3c114ce50e86160ff0a1720a2 +size 9016826528 diff --git a/checkpoint-228/global_step228/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-228/global_step228/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..417b082643685d5aac1e14bacc4b52c0adfa670d --- /dev/null +++ b/checkpoint-228/global_step228/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:613a22985902c9d1d1be9c9ce2dab87a331de51fd89da9348dd5f43ec07cd409 +size 27050164444 diff --git a/checkpoint-228/global_step228/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-228/global_step228/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..15369f5d428ce41de2b0b4778e7740d3df195dc6 --- /dev/null +++ b/checkpoint-228/global_step228/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5620d8ab8d21b3d10f4b88326c726459339e432ae6fc1dc2205c16dc137fce0e +size 27050169884 diff --git a/checkpoint-228/global_step228/mp_rank_00_model_states.pt b/checkpoint-228/global_step228/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b636fabd7086a0a4fc88eefd869ed6fbdf83123d --- /dev/null +++ b/checkpoint-228/global_step228/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24fb10124c58ab3bc95ccb3b9dcc7a5bdeefd22a99b84346cf3c1df9f76c8c27 +size 9776788601 diff --git a/checkpoint-228/latest b/checkpoint-228/latest new file mode 100644 index 0000000000000000000000000000000000000000..74f667dd5aec7b1dcf458da255b4d04f2e864037 --- /dev/null +++ b/checkpoint-228/latest @@ -0,0 +1 @@ +global_step228 \ No newline at end of file diff --git a/checkpoint-228/rng_state_0.pth b/checkpoint-228/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..a5d14df7a6086589916370411c87ca4b0ff67991 --- /dev/null +++ b/checkpoint-228/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0df88b39032cfb5865c667e31cb370a479cdab725990452a7f491c7100c7266f +size 14512 diff --git a/checkpoint-228/rng_state_1.pth b/checkpoint-228/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..b56566661121ee55636a0083720baa794abae012 --- /dev/null +++ b/checkpoint-228/rng_state_1.pth @@ -0,0 +1,3 
@@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fccaad5cbd19ebb15866094c25b042ca7260a9e174b4a8e2a720bae96eb35fe +size 14512 diff --git a/checkpoint-228/scheduler.pt b/checkpoint-228/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..bb6c4bace72a7b1c8de145936466d2b1e4a21463 --- /dev/null +++ b/checkpoint-228/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff6a932650a4637e48ce03bf2825ccc9a1ed4f05bb0a73538a68ddc440b889a8 +size 1064 diff --git a/checkpoint-228/special_tokens_map.json b/checkpoint-228/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..278b7f0f84be865c4687700ee7b3c63d89a51e18 --- /dev/null +++ b/checkpoint-228/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-228/tokenizer.json b/checkpoint-228/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-228/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-228/tokenizer_config.json b/checkpoint-228/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..edd01b980c1db496ea102a51c972ee8f5d1a2c74 --- /dev/null +++ b/checkpoint-228/tokenizer_config.json @@ -0,0 +1,2064 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + 
"content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": 
"<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": 
false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": 
false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": 
"<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": 
"<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}{%- if messages[0]['role'] == 'system' %}{%- set system_message = messages[0]['content']|trim %}{%- set messages = messages[1:] %}{%- else %}{%- set system_message = \"\" %}{%- endif %}{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}{{- system_message }}{{- \"<|eot_id|>\" }}{%- for message in messages %}{%- if message['role'] == 'assistant' and '' in message['content'] %}{%- set content = message['content'].split('')[-1].lstrip() %}{%- else %}{%- set content = message['content'] %}{%- endif %}{{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n' + content | trim + '<|eot_id|>' }}{%- endfor %}{%- if add_generation_prompt %}{{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}{%- endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + 
"model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|end_of_text|>", + "tokenizer_class": "PreTrainedTokenizer" +} diff --git a/checkpoint-228/trainer_state.json b/checkpoint-228/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f4b145afc6ada1f7d4c584b2465ea96787544a67 --- /dev/null +++ b/checkpoint-228/trainer_state.json @@ -0,0 +1,1629 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 500, + "global_step": 228, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008771929824561403, + "grad_norm": 39.56407165527344, + "learning_rate": 5.0000000000000004e-08, + "loss": 5.1375, + "step": 1 + }, + { + "epoch": 0.017543859649122806, + "grad_norm": 40.30452346801758, + "learning_rate": 1.0000000000000001e-07, + "loss": 5.1185, + "step": 2 + }, + { + "epoch": 0.02631578947368421, + "grad_norm": 40.062313079833984, + "learning_rate": 1.5000000000000002e-07, + "loss": 5.0762, + "step": 3 + }, + { + "epoch": 0.03508771929824561, + "grad_norm": 39.17148208618164, + "learning_rate": 2.0000000000000002e-07, + "loss": 5.016, + "step": 4 + }, + { + "epoch": 0.043859649122807015, + "grad_norm": 40.67367172241211, + "learning_rate": 2.5000000000000004e-07, + "loss": 5.0428, + "step": 5 + }, + { + "epoch": 0.05263157894736842, + "grad_norm": 38.18095016479492, + "learning_rate": 3.0000000000000004e-07, + "loss": 5.2025, + "step": 6 + }, + { + "epoch": 0.06140350877192982, + "grad_norm": 39.12940979003906, + "learning_rate": 3.5000000000000004e-07, + "loss": 4.9896, + "step": 7 + }, + { + "epoch": 0.07017543859649122, + "grad_norm": 38.84568405151367, + "learning_rate": 4.0000000000000003e-07, + "loss": 5.1078, + "step": 8 + }, + { + "epoch": 0.07894736842105263, + "grad_norm": 39.38333511352539, + "learning_rate": 4.5000000000000003e-07, + "loss": 5.0808, + 
"step": 9 + }, + { + "epoch": 0.08771929824561403, + "grad_norm": 39.427650451660156, + "learning_rate": 5.000000000000001e-07, + "loss": 5.0534, + "step": 10 + }, + { + "epoch": 0.09649122807017543, + "grad_norm": 39.29513168334961, + "learning_rate": 5.5e-07, + "loss": 5.058, + "step": 11 + }, + { + "epoch": 0.10526315789473684, + "grad_norm": 39.641231536865234, + "learning_rate": 6.000000000000001e-07, + "loss": 5.0317, + "step": 12 + }, + { + "epoch": 0.11403508771929824, + "grad_norm": 37.91259765625, + "learning_rate": 6.5e-07, + "loss": 4.912, + "step": 13 + }, + { + "epoch": 0.12280701754385964, + "grad_norm": 38.203548431396484, + "learning_rate": 7.000000000000001e-07, + "loss": 4.9705, + "step": 14 + }, + { + "epoch": 0.13157894736842105, + "grad_norm": 39.15998840332031, + "learning_rate": 7.5e-07, + "loss": 4.6962, + "step": 15 + }, + { + "epoch": 0.14035087719298245, + "grad_norm": 37.754669189453125, + "learning_rate": 8.000000000000001e-07, + "loss": 4.6262, + "step": 16 + }, + { + "epoch": 0.14912280701754385, + "grad_norm": 35.871490478515625, + "learning_rate": 8.500000000000001e-07, + "loss": 4.5422, + "step": 17 + }, + { + "epoch": 0.15789473684210525, + "grad_norm": 36.16888427734375, + "learning_rate": 9.000000000000001e-07, + "loss": 4.664, + "step": 18 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 33.520118713378906, + "learning_rate": 9.500000000000001e-07, + "loss": 4.4697, + "step": 19 + }, + { + "epoch": 0.17543859649122806, + "grad_norm": 30.896282196044922, + "learning_rate": 1.0000000000000002e-06, + "loss": 4.3568, + "step": 20 + }, + { + "epoch": 0.18421052631578946, + "grad_norm": 29.944643020629883, + "learning_rate": 1.0500000000000001e-06, + "loss": 4.2269, + "step": 21 + }, + { + "epoch": 0.19298245614035087, + "grad_norm": 25.224485397338867, + "learning_rate": 1.1e-06, + "loss": 4.1272, + "step": 22 + }, + { + "epoch": 0.20175438596491227, + "grad_norm": 24.410480499267578, + "learning_rate": 
1.1500000000000002e-06, + "loss": 4.0585, + "step": 23 + }, + { + "epoch": 0.21052631578947367, + "grad_norm": 21.480648040771484, + "learning_rate": 1.2000000000000002e-06, + "loss": 3.9472, + "step": 24 + }, + { + "epoch": 0.21929824561403508, + "grad_norm": 20.61946678161621, + "learning_rate": 1.25e-06, + "loss": 3.8879, + "step": 25 + }, + { + "epoch": 0.22807017543859648, + "grad_norm": 19.578271865844727, + "learning_rate": 1.3e-06, + "loss": 3.6783, + "step": 26 + }, + { + "epoch": 0.23684210526315788, + "grad_norm": 17.418983459472656, + "learning_rate": 1.3500000000000002e-06, + "loss": 3.6826, + "step": 27 + }, + { + "epoch": 0.24561403508771928, + "grad_norm": 18.160301208496094, + "learning_rate": 1.4000000000000001e-06, + "loss": 3.478, + "step": 28 + }, + { + "epoch": 0.2543859649122807, + "grad_norm": 17.573204040527344, + "learning_rate": 1.45e-06, + "loss": 3.459, + "step": 29 + }, + { + "epoch": 0.2631578947368421, + "grad_norm": 17.1265869140625, + "learning_rate": 1.5e-06, + "loss": 3.3999, + "step": 30 + }, + { + "epoch": 0.2719298245614035, + "grad_norm": 15.527145385742188, + "learning_rate": 1.5500000000000002e-06, + "loss": 3.2817, + "step": 31 + }, + { + "epoch": 0.2807017543859649, + "grad_norm": 14.773847579956055, + "learning_rate": 1.6000000000000001e-06, + "loss": 3.234, + "step": 32 + }, + { + "epoch": 0.2894736842105263, + "grad_norm": 12.039301872253418, + "learning_rate": 1.6500000000000003e-06, + "loss": 3.132, + "step": 33 + }, + { + "epoch": 0.2982456140350877, + "grad_norm": 9.217979431152344, + "learning_rate": 1.7000000000000002e-06, + "loss": 3.0548, + "step": 34 + }, + { + "epoch": 0.30701754385964913, + "grad_norm": 7.575639724731445, + "learning_rate": 1.75e-06, + "loss": 2.9529, + "step": 35 + }, + { + "epoch": 0.3157894736842105, + "grad_norm": 7.496004104614258, + "learning_rate": 1.8000000000000001e-06, + "loss": 2.8967, + "step": 36 + }, + { + "epoch": 0.32456140350877194, + "grad_norm": 7.45414924621582, + 
"learning_rate": 1.85e-06, + "loss": 2.8837, + "step": 37 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 8.555658340454102, + "learning_rate": 1.9000000000000002e-06, + "loss": 2.7473, + "step": 38 + }, + { + "epoch": 0.34210526315789475, + "grad_norm": 10.03805160522461, + "learning_rate": 1.9500000000000004e-06, + "loss": 2.7355, + "step": 39 + }, + { + "epoch": 0.3508771929824561, + "grad_norm": 9.30649471282959, + "learning_rate": 2.0000000000000003e-06, + "loss": 2.6587, + "step": 40 + }, + { + "epoch": 0.35964912280701755, + "grad_norm": 8.510339736938477, + "learning_rate": 2.05e-06, + "loss": 2.5977, + "step": 41 + }, + { + "epoch": 0.3684210526315789, + "grad_norm": 4.709080696105957, + "learning_rate": 2.1000000000000002e-06, + "loss": 2.6286, + "step": 42 + }, + { + "epoch": 0.37719298245614036, + "grad_norm": 5.128961086273193, + "learning_rate": 2.15e-06, + "loss": 2.4558, + "step": 43 + }, + { + "epoch": 0.38596491228070173, + "grad_norm": 5.190136432647705, + "learning_rate": 2.2e-06, + "loss": 2.4432, + "step": 44 + }, + { + "epoch": 0.39473684210526316, + "grad_norm": 4.893551349639893, + "learning_rate": 2.25e-06, + "loss": 2.4939, + "step": 45 + }, + { + "epoch": 0.40350877192982454, + "grad_norm": 5.2434983253479, + "learning_rate": 2.3000000000000004e-06, + "loss": 2.3381, + "step": 46 + }, + { + "epoch": 0.41228070175438597, + "grad_norm": 5.122412204742432, + "learning_rate": 2.35e-06, + "loss": 2.313, + "step": 47 + }, + { + "epoch": 0.42105263157894735, + "grad_norm": 4.577274799346924, + "learning_rate": 2.4000000000000003e-06, + "loss": 2.2236, + "step": 48 + }, + { + "epoch": 0.4298245614035088, + "grad_norm": 4.722769737243652, + "learning_rate": 2.4500000000000003e-06, + "loss": 2.1987, + "step": 49 + }, + { + "epoch": 0.43859649122807015, + "grad_norm": 5.059235095977783, + "learning_rate": 2.5e-06, + "loss": 2.1415, + "step": 50 + }, + { + "epoch": 0.4473684210526316, + "grad_norm": 4.454439640045166, + "learning_rate": 
2.55e-06, + "loss": 2.0466, + "step": 51 + }, + { + "epoch": 0.45614035087719296, + "grad_norm": 4.94586706161499, + "learning_rate": 2.6e-06, + "loss": 1.8762, + "step": 52 + }, + { + "epoch": 0.4649122807017544, + "grad_norm": 4.704402446746826, + "learning_rate": 2.6500000000000005e-06, + "loss": 1.8012, + "step": 53 + }, + { + "epoch": 0.47368421052631576, + "grad_norm": 6.125903129577637, + "learning_rate": 2.7000000000000004e-06, + "loss": 1.7669, + "step": 54 + }, + { + "epoch": 0.4824561403508772, + "grad_norm": 4.5356059074401855, + "learning_rate": 2.7500000000000004e-06, + "loss": 1.6607, + "step": 55 + }, + { + "epoch": 0.49122807017543857, + "grad_norm": 6.56803035736084, + "learning_rate": 2.8000000000000003e-06, + "loss": 1.6291, + "step": 56 + }, + { + "epoch": 0.5, + "grad_norm": 4.910050392150879, + "learning_rate": 2.85e-06, + "loss": 1.5545, + "step": 57 + }, + { + "epoch": 0.5087719298245614, + "grad_norm": 8.733433723449707, + "learning_rate": 2.9e-06, + "loss": 1.4206, + "step": 58 + }, + { + "epoch": 0.5175438596491229, + "grad_norm": 8.582486152648926, + "learning_rate": 2.95e-06, + "loss": 1.3912, + "step": 59 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 13.710689544677734, + "learning_rate": 3e-06, + "loss": 1.3297, + "step": 60 + }, + { + "epoch": 0.5350877192982456, + "grad_norm": 23.400312423706055, + "learning_rate": 3.05e-06, + "loss": 1.296, + "step": 61 + }, + { + "epoch": 0.543859649122807, + "grad_norm": 5.678805351257324, + "learning_rate": 3.1000000000000004e-06, + "loss": 1.2259, + "step": 62 + }, + { + "epoch": 0.5526315789473685, + "grad_norm": 14.700899124145508, + "learning_rate": 3.1500000000000003e-06, + "loss": 1.1087, + "step": 63 + }, + { + "epoch": 0.5614035087719298, + "grad_norm": 19.38919448852539, + "learning_rate": 3.2000000000000003e-06, + "loss": 1.1805, + "step": 64 + }, + { + "epoch": 0.5701754385964912, + "grad_norm": 8.460039138793945, + "learning_rate": 3.2500000000000002e-06, + "loss": 1.0963, 
+ "step": 65 + }, + { + "epoch": 0.5789473684210527, + "grad_norm": 13.371014595031738, + "learning_rate": 3.3000000000000006e-06, + "loss": 1.0627, + "step": 66 + }, + { + "epoch": 0.5877192982456141, + "grad_norm": 22.380569458007812, + "learning_rate": 3.3500000000000005e-06, + "loss": 1.0869, + "step": 67 + }, + { + "epoch": 0.5964912280701754, + "grad_norm": 5.780513286590576, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.9991, + "step": 68 + }, + { + "epoch": 0.6052631578947368, + "grad_norm": 19.850841522216797, + "learning_rate": 3.45e-06, + "loss": 0.9683, + "step": 69 + }, + { + "epoch": 0.6140350877192983, + "grad_norm": 17.160703659057617, + "learning_rate": 3.5e-06, + "loss": 0.845, + "step": 70 + }, + { + "epoch": 0.6228070175438597, + "grad_norm": 14.264311790466309, + "learning_rate": 3.5500000000000003e-06, + "loss": 0.8059, + "step": 71 + }, + { + "epoch": 0.631578947368421, + "grad_norm": 26.39459991455078, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.85, + "step": 72 + }, + { + "epoch": 0.6403508771929824, + "grad_norm": 51.10348892211914, + "learning_rate": 3.65e-06, + "loss": 0.9755, + "step": 73 + }, + { + "epoch": 0.6491228070175439, + "grad_norm": 28.795856475830078, + "learning_rate": 3.7e-06, + "loss": 0.8966, + "step": 74 + }, + { + "epoch": 0.6578947368421053, + "grad_norm": 4.6617937088012695, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.7716, + "step": 75 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 15.729666709899902, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.7578, + "step": 76 + }, + { + "epoch": 0.6754385964912281, + "grad_norm": 7.109970569610596, + "learning_rate": 3.85e-06, + "loss": 0.7055, + "step": 77 + }, + { + "epoch": 0.6842105263157895, + "grad_norm": 20.84659194946289, + "learning_rate": 3.900000000000001e-06, + "loss": 0.7458, + "step": 78 + }, + { + "epoch": 0.6929824561403509, + "grad_norm": 21.601303100585938, + "learning_rate": 3.95e-06, + "loss": 0.6879, + "step": 
79 + }, + { + "epoch": 0.7017543859649122, + "grad_norm": 3.6914751529693604, + "learning_rate": 4.000000000000001e-06, + "loss": 0.6179, + "step": 80 + }, + { + "epoch": 0.7105263157894737, + "grad_norm": 16.539325714111328, + "learning_rate": 4.05e-06, + "loss": 0.5716, + "step": 81 + }, + { + "epoch": 0.7192982456140351, + "grad_norm": 13.931925773620605, + "learning_rate": 4.1e-06, + "loss": 0.558, + "step": 82 + }, + { + "epoch": 0.7280701754385965, + "grad_norm": 10.52951717376709, + "learning_rate": 4.15e-06, + "loss": 0.6018, + "step": 83 + }, + { + "epoch": 0.7368421052631579, + "grad_norm": 17.337060928344727, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.5501, + "step": 84 + }, + { + "epoch": 0.7456140350877193, + "grad_norm": 13.500468254089355, + "learning_rate": 4.25e-06, + "loss": 0.5214, + "step": 85 + }, + { + "epoch": 0.7543859649122807, + "grad_norm": 10.290645599365234, + "learning_rate": 4.3e-06, + "loss": 0.4996, + "step": 86 + }, + { + "epoch": 0.7631578947368421, + "grad_norm": 9.757556915283203, + "learning_rate": 4.350000000000001e-06, + "loss": 0.498, + "step": 87 + }, + { + "epoch": 0.7719298245614035, + "grad_norm": 9.325140953063965, + "learning_rate": 4.4e-06, + "loss": 0.4721, + "step": 88 + }, + { + "epoch": 0.7807017543859649, + "grad_norm": 2.9322128295898438, + "learning_rate": 4.450000000000001e-06, + "loss": 0.4528, + "step": 89 + }, + { + "epoch": 0.7894736842105263, + "grad_norm": 10.484073638916016, + "learning_rate": 4.5e-06, + "loss": 0.445, + "step": 90 + }, + { + "epoch": 0.7982456140350878, + "grad_norm": 32.7827262878418, + "learning_rate": 4.5500000000000005e-06, + "loss": 0.5105, + "step": 91 + }, + { + "epoch": 0.8070175438596491, + "grad_norm": 2.8477306365966797, + "learning_rate": 4.600000000000001e-06, + "loss": 0.4117, + "step": 92 + }, + { + "epoch": 0.8157894736842105, + "grad_norm": 2.7680225372314453, + "learning_rate": 4.65e-06, + "loss": 0.3653, + "step": 93 + }, + { + "epoch": 
0.8245614035087719, + "grad_norm": 2.6512742042541504, + "learning_rate": 4.7e-06, + "loss": 0.3878, + "step": 94 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 6.453914165496826, + "learning_rate": 4.75e-06, + "loss": 0.3611, + "step": 95 + }, + { + "epoch": 0.8421052631578947, + "grad_norm": 3.4594080448150635, + "learning_rate": 4.800000000000001e-06, + "loss": 0.3817, + "step": 96 + }, + { + "epoch": 0.8508771929824561, + "grad_norm": 3.6144917011260986, + "learning_rate": 4.85e-06, + "loss": 0.3618, + "step": 97 + }, + { + "epoch": 0.8596491228070176, + "grad_norm": 5.349407196044922, + "learning_rate": 4.9000000000000005e-06, + "loss": 0.3218, + "step": 98 + }, + { + "epoch": 0.868421052631579, + "grad_norm": 13.671236991882324, + "learning_rate": 4.95e-06, + "loss": 0.3329, + "step": 99 + }, + { + "epoch": 0.8771929824561403, + "grad_norm": 5.84046745300293, + "learning_rate": 5e-06, + "loss": 0.2967, + "step": 100 + }, + { + "epoch": 0.8859649122807017, + "grad_norm": 14.005338668823242, + "learning_rate": 4.999963827125897e-06, + "loss": 0.303, + "step": 101 + }, + { + "epoch": 0.8947368421052632, + "grad_norm": 9.18114185333252, + "learning_rate": 4.999855309550366e-06, + "loss": 0.2762, + "step": 102 + }, + { + "epoch": 0.9035087719298246, + "grad_norm": 3.0800487995147705, + "learning_rate": 4.999674450413725e-06, + "loss": 0.2628, + "step": 103 + }, + { + "epoch": 0.9122807017543859, + "grad_norm": 82.03578186035156, + "learning_rate": 4.999421254949728e-06, + "loss": 0.4065, + "step": 104 + }, + { + "epoch": 0.9210526315789473, + "grad_norm": 77.66315460205078, + "learning_rate": 4.99909573048542e-06, + "loss": 0.4307, + "step": 105 + }, + { + "epoch": 0.9298245614035088, + "grad_norm": 18.28767967224121, + "learning_rate": 4.998697886440927e-06, + "loss": 0.2571, + "step": 106 + }, + { + "epoch": 0.9385964912280702, + "grad_norm": 5.960445880889893, + "learning_rate": 4.998227734329177e-06, + "loss": 0.2847, + "step": 107 + }, + { + "epoch": 
0.9473684210526315, + "grad_norm": 5.437699794769287, + "learning_rate": 4.9976852877555755e-06, + "loss": 0.2728, + "step": 108 + }, + { + "epoch": 0.956140350877193, + "grad_norm": 3.379631280899048, + "learning_rate": 4.997070562417602e-06, + "loss": 0.2467, + "step": 109 + }, + { + "epoch": 0.9649122807017544, + "grad_norm": 3.1625075340270996, + "learning_rate": 4.996383576104362e-06, + "loss": 0.2273, + "step": 110 + }, + { + "epoch": 0.9736842105263158, + "grad_norm": 15.588600158691406, + "learning_rate": 4.995624348696071e-06, + "loss": 0.2486, + "step": 111 + }, + { + "epoch": 0.9824561403508771, + "grad_norm": 2.631044387817383, + "learning_rate": 4.9947929021634815e-06, + "loss": 0.1964, + "step": 112 + }, + { + "epoch": 0.9912280701754386, + "grad_norm": 4.706504821777344, + "learning_rate": 4.993889260567239e-06, + "loss": 0.1901, + "step": 113 + }, + { + "epoch": 1.0, + "grad_norm": 10.368465423583984, + "learning_rate": 4.9929134500571954e-06, + "loss": 0.1996, + "step": 114 + }, + { + "epoch": 1.0087719298245614, + "grad_norm": 30.44986343383789, + "learning_rate": 4.991865498871647e-06, + "loss": 0.2606, + "step": 115 + }, + { + "epoch": 1.0175438596491229, + "grad_norm": 14.421515464782715, + "learning_rate": 4.99074543733652e-06, + "loss": 0.2394, + "step": 116 + }, + { + "epoch": 1.0263157894736843, + "grad_norm": 14.072005271911621, + "learning_rate": 4.989553297864489e-06, + "loss": 0.2288, + "step": 117 + }, + { + "epoch": 1.0350877192982457, + "grad_norm": 4.395325660705566, + "learning_rate": 4.988289114954045e-06, + "loss": 0.2129, + "step": 118 + }, + { + "epoch": 1.043859649122807, + "grad_norm": 7.286703586578369, + "learning_rate": 4.986952925188489e-06, + "loss": 0.186, + "step": 119 + }, + { + "epoch": 1.0526315789473684, + "grad_norm": 8.332784652709961, + "learning_rate": 4.98554476723488e-06, + "loss": 0.178, + "step": 120 + }, + { + "epoch": 1.0614035087719298, + "grad_norm": 1.3646447658538818, + "learning_rate": 
4.984064681842917e-06, + "loss": 0.1687, + "step": 121 + }, + { + "epoch": 1.0701754385964912, + "grad_norm": 4.494940757751465, + "learning_rate": 4.982512711843753e-06, + "loss": 0.1881, + "step": 122 + }, + { + "epoch": 1.0789473684210527, + "grad_norm": 3.3929836750030518, + "learning_rate": 4.980888902148757e-06, + "loss": 0.1764, + "step": 123 + }, + { + "epoch": 1.087719298245614, + "grad_norm": 1.8281155824661255, + "learning_rate": 4.979193299748225e-06, + "loss": 0.1602, + "step": 124 + }, + { + "epoch": 1.0964912280701755, + "grad_norm": 3.494239568710327, + "learning_rate": 4.977425953710005e-06, + "loss": 0.1729, + "step": 125 + }, + { + "epoch": 1.1052631578947367, + "grad_norm": 1.500410556793213, + "learning_rate": 4.975586915178084e-06, + "loss": 0.1666, + "step": 126 + }, + { + "epoch": 1.1140350877192982, + "grad_norm": 1.4680222272872925, + "learning_rate": 4.973676237371111e-06, + "loss": 0.159, + "step": 127 + }, + { + "epoch": 1.1228070175438596, + "grad_norm": 3.0383460521698, + "learning_rate": 4.971693975580851e-06, + "loss": 0.1484, + "step": 128 + }, + { + "epoch": 1.131578947368421, + "grad_norm": 3.74821138381958, + "learning_rate": 4.969640187170591e-06, + "loss": 0.1586, + "step": 129 + }, + { + "epoch": 1.1403508771929824, + "grad_norm": 4.682602405548096, + "learning_rate": 4.967514931573473e-06, + "loss": 0.1619, + "step": 130 + }, + { + "epoch": 1.1491228070175439, + "grad_norm": 3.90673565864563, + "learning_rate": 4.965318270290779e-06, + "loss": 0.164, + "step": 131 + }, + { + "epoch": 1.1578947368421053, + "grad_norm": 2.2017388343811035, + "learning_rate": 4.963050266890152e-06, + "loss": 0.1499, + "step": 132 + }, + { + "epoch": 1.1666666666666667, + "grad_norm": 2.4211816787719727, + "learning_rate": 4.960710987003753e-06, + "loss": 0.1387, + "step": 133 + }, + { + "epoch": 1.1754385964912282, + "grad_norm": 1.7753759622573853, + "learning_rate": 4.958300498326363e-06, + "loss": 0.1441, + "step": 134 + }, + { + "epoch": 
1.1842105263157894, + "grad_norm": 1.5529910326004028, + "learning_rate": 4.955818870613425e-06, + "loss": 0.1304, + "step": 135 + }, + { + "epoch": 1.1929824561403508, + "grad_norm": 2.090593099594116, + "learning_rate": 4.953266175679023e-06, + "loss": 0.1419, + "step": 136 + }, + { + "epoch": 1.2017543859649122, + "grad_norm": 2.7141878604888916, + "learning_rate": 4.95064248739381e-06, + "loss": 0.1444, + "step": 137 + }, + { + "epoch": 1.2105263157894737, + "grad_norm": 2.3690481185913086, + "learning_rate": 4.947947881682861e-06, + "loss": 0.1383, + "step": 138 + }, + { + "epoch": 1.219298245614035, + "grad_norm": 2.2403147220611572, + "learning_rate": 4.945182436523482e-06, + "loss": 0.1418, + "step": 139 + }, + { + "epoch": 1.2280701754385965, + "grad_norm": 1.3939160108566284, + "learning_rate": 4.942346231942955e-06, + "loss": 0.1307, + "step": 140 + }, + { + "epoch": 1.236842105263158, + "grad_norm": 11.276732444763184, + "learning_rate": 4.939439350016214e-06, + "loss": 0.1397, + "step": 141 + }, + { + "epoch": 1.2456140350877192, + "grad_norm": 8.260516166687012, + "learning_rate": 4.9364618748634794e-06, + "loss": 0.1426, + "step": 142 + }, + { + "epoch": 1.2543859649122808, + "grad_norm": 2.09720516204834, + "learning_rate": 4.933413892647819e-06, + "loss": 0.1323, + "step": 143 + }, + { + "epoch": 1.263157894736842, + "grad_norm": 1.802125334739685, + "learning_rate": 4.9302954915726535e-06, + "loss": 0.1304, + "step": 144 + }, + { + "epoch": 1.2719298245614035, + "grad_norm": 1.7151471376419067, + "learning_rate": 4.927106761879207e-06, + "loss": 0.1264, + "step": 145 + }, + { + "epoch": 1.280701754385965, + "grad_norm": 1.6970336437225342, + "learning_rate": 4.923847795843894e-06, + "loss": 0.1227, + "step": 146 + }, + { + "epoch": 1.2894736842105263, + "grad_norm": 16.60441017150879, + "learning_rate": 4.920518687775647e-06, + "loss": 0.1606, + "step": 147 + }, + { + "epoch": 1.2982456140350878, + "grad_norm": 6.470354080200195, + 
"learning_rate": 4.917119534013194e-06, + "loss": 0.1447, + "step": 148 + }, + { + "epoch": 1.3070175438596492, + "grad_norm": 1.4908231496810913, + "learning_rate": 4.913650432922264e-06, + "loss": 0.1343, + "step": 149 + }, + { + "epoch": 1.3157894736842106, + "grad_norm": 3.19964861869812, + "learning_rate": 4.91011148489274e-06, + "loss": 0.1354, + "step": 150 + }, + { + "epoch": 1.3245614035087718, + "grad_norm": 2.6052839756011963, + "learning_rate": 4.906502792335761e-06, + "loss": 0.1342, + "step": 151 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 2.0719165802001953, + "learning_rate": 4.9028244596807525e-06, + "loss": 0.1359, + "step": 152 + }, + { + "epoch": 1.3421052631578947, + "grad_norm": 0.8086919784545898, + "learning_rate": 4.899076593372405e-06, + "loss": 0.1279, + "step": 153 + }, + { + "epoch": 1.3508771929824561, + "grad_norm": 1.0056848526000977, + "learning_rate": 4.8952593018675955e-06, + "loss": 0.1162, + "step": 154 + }, + { + "epoch": 1.3596491228070176, + "grad_norm": 5.72553014755249, + "learning_rate": 4.891372695632249e-06, + "loss": 0.1315, + "step": 155 + }, + { + "epoch": 1.368421052631579, + "grad_norm": 1.522894024848938, + "learning_rate": 4.887416887138139e-06, + "loss": 0.1266, + "step": 156 + }, + { + "epoch": 1.3771929824561404, + "grad_norm": 2.019472122192383, + "learning_rate": 4.883391990859635e-06, + "loss": 0.1262, + "step": 157 + }, + { + "epoch": 1.3859649122807016, + "grad_norm": 1.8594422340393066, + "learning_rate": 4.879298123270391e-06, + "loss": 0.125, + "step": 158 + }, + { + "epoch": 1.3947368421052633, + "grad_norm": 1.365377426147461, + "learning_rate": 4.8751354028399725e-06, + "loss": 0.1218, + "step": 159 + }, + { + "epoch": 1.4035087719298245, + "grad_norm": 3.553309917449951, + "learning_rate": 4.870903950030429e-06, + "loss": 0.1272, + "step": 160 + }, + { + "epoch": 1.412280701754386, + "grad_norm": 2.1770920753479004, + "learning_rate": 4.866603887292809e-06, + "loss": 0.1213, + "step": 161 
+ }, + { + "epoch": 1.4210526315789473, + "grad_norm": 1.6058955192565918, + "learning_rate": 4.862235339063613e-06, + "loss": 0.1173, + "step": 162 + }, + { + "epoch": 1.4298245614035088, + "grad_norm": 1.3208314180374146, + "learning_rate": 4.857798431761199e-06, + "loss": 0.1183, + "step": 163 + }, + { + "epoch": 1.4385964912280702, + "grad_norm": 1.282729983329773, + "learning_rate": 4.853293293782118e-06, + "loss": 0.1209, + "step": 164 + }, + { + "epoch": 1.4473684210526316, + "grad_norm": 1.3838152885437012, + "learning_rate": 4.848720055497401e-06, + "loss": 0.1198, + "step": 165 + }, + { + "epoch": 1.456140350877193, + "grad_norm": 1.2930737733840942, + "learning_rate": 4.844078849248785e-06, + "loss": 0.1268, + "step": 166 + }, + { + "epoch": 1.4649122807017543, + "grad_norm": 1.7022266387939453, + "learning_rate": 4.839369809344888e-06, + "loss": 0.1198, + "step": 167 + }, + { + "epoch": 1.4736842105263157, + "grad_norm": 1.0927815437316895, + "learning_rate": 4.834593072057313e-06, + "loss": 0.1132, + "step": 168 + }, + { + "epoch": 1.4824561403508771, + "grad_norm": 0.9326333999633789, + "learning_rate": 4.829748775616716e-06, + "loss": 0.1193, + "step": 169 + }, + { + "epoch": 1.4912280701754386, + "grad_norm": 1.3564742803573608, + "learning_rate": 4.8248370602087954e-06, + "loss": 0.118, + "step": 170 + }, + { + "epoch": 1.5, + "grad_norm": 1.19778573513031, + "learning_rate": 4.819858067970243e-06, + "loss": 0.1122, + "step": 171 + }, + { + "epoch": 1.5087719298245614, + "grad_norm": 2.8438351154327393, + "learning_rate": 4.814811942984625e-06, + "loss": 0.1217, + "step": 172 + }, + { + "epoch": 1.5175438596491229, + "grad_norm": 1.0701063871383667, + "learning_rate": 4.809698831278217e-06, + "loss": 0.1114, + "step": 173 + }, + { + "epoch": 1.526315789473684, + "grad_norm": 0.9053553938865662, + "learning_rate": 4.804518880815776e-06, + "loss": 0.1178, + "step": 174 + }, + { + "epoch": 1.5350877192982457, + "grad_norm": 0.42274603247642517, + 
"learning_rate": 4.799272241496259e-06, + "loss": 0.1091, + "step": 175 + }, + { + "epoch": 1.543859649122807, + "grad_norm": 0.8576470017433167, + "learning_rate": 4.793959065148484e-06, + "loss": 0.1134, + "step": 176 + }, + { + "epoch": 1.5526315789473686, + "grad_norm": 0.5910662412643433, + "learning_rate": 4.78857950552674e-06, + "loss": 0.1148, + "step": 177 + }, + { + "epoch": 1.5614035087719298, + "grad_norm": 0.8761632442474365, + "learning_rate": 4.783133718306331e-06, + "loss": 0.1125, + "step": 178 + }, + { + "epoch": 1.5701754385964912, + "grad_norm": 1.9190795421600342, + "learning_rate": 4.777621861079079e-06, + "loss": 0.1148, + "step": 179 + }, + { + "epoch": 1.5789473684210527, + "grad_norm": 0.6199957728385925, + "learning_rate": 4.772044093348757e-06, + "loss": 0.1097, + "step": 180 + }, + { + "epoch": 1.587719298245614, + "grad_norm": 1.562089443206787, + "learning_rate": 4.766400576526479e-06, + "loss": 0.1097, + "step": 181 + }, + { + "epoch": 1.5964912280701755, + "grad_norm": 1.4957091808319092, + "learning_rate": 4.760691473926021e-06, + "loss": 0.1216, + "step": 182 + }, + { + "epoch": 1.6052631578947367, + "grad_norm": 0.9863570332527161, + "learning_rate": 4.754916950759105e-06, + "loss": 0.1122, + "step": 183 + }, + { + "epoch": 1.6140350877192984, + "grad_norm": 0.5803346633911133, + "learning_rate": 4.749077174130609e-06, + "loss": 0.1103, + "step": 184 + }, + { + "epoch": 1.6228070175438596, + "grad_norm": 1.8789891004562378, + "learning_rate": 4.743172313033738e-06, + "loss": 0.1191, + "step": 185 + }, + { + "epoch": 1.631578947368421, + "grad_norm": 0.8731380105018616, + "learning_rate": 4.7372025383451285e-06, + "loss": 0.1154, + "step": 186 + }, + { + "epoch": 1.6403508771929824, + "grad_norm": 1.3535627126693726, + "learning_rate": 4.7311680228199075e-06, + "loss": 0.1123, + "step": 187 + }, + { + "epoch": 1.6491228070175439, + "grad_norm": 0.7211089134216309, + "learning_rate": 4.725068941086693e-06, + "loss": 0.1134, + 
"step": 188 + }, + { + "epoch": 1.6578947368421053, + "grad_norm": 1.4752328395843506, + "learning_rate": 4.718905469642534e-06, + "loss": 0.1185, + "step": 189 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.9822680354118347, + "learning_rate": 4.712677786847814e-06, + "loss": 0.1146, + "step": 190 + }, + { + "epoch": 1.6754385964912282, + "grad_norm": 1.1308330297470093, + "learning_rate": 4.706386072921083e-06, + "loss": 0.1061, + "step": 191 + }, + { + "epoch": 1.6842105263157894, + "grad_norm": 5.331939697265625, + "learning_rate": 4.70003050993384e-06, + "loss": 0.1153, + "step": 192 + }, + { + "epoch": 1.692982456140351, + "grad_norm": 0.6911673545837402, + "learning_rate": 4.6936112818052674e-06, + "loss": 0.1098, + "step": 193 + }, + { + "epoch": 1.7017543859649122, + "grad_norm": 0.5160980224609375, + "learning_rate": 4.687128574296912e-06, + "loss": 0.1073, + "step": 194 + }, + { + "epoch": 1.7105263157894737, + "grad_norm": 1.5724798440933228, + "learning_rate": 4.680582575007303e-06, + "loss": 0.121, + "step": 195 + }, + { + "epoch": 1.719298245614035, + "grad_norm": 1.3960011005401611, + "learning_rate": 4.6739734733665275e-06, + "loss": 0.1145, + "step": 196 + }, + { + "epoch": 1.7280701754385965, + "grad_norm": 1.4949183464050293, + "learning_rate": 4.6673014606307465e-06, + "loss": 0.1166, + "step": 197 + }, + { + "epoch": 1.736842105263158, + "grad_norm": 1.6873422861099243, + "learning_rate": 4.660566729876661e-06, + "loss": 0.1115, + "step": 198 + }, + { + "epoch": 1.7456140350877192, + "grad_norm": 1.3443641662597656, + "learning_rate": 4.653769475995926e-06, + "loss": 0.1119, + "step": 199 + }, + { + "epoch": 1.7543859649122808, + "grad_norm": 0.807525098323822, + "learning_rate": 4.646909895689508e-06, + "loss": 0.1059, + "step": 200 + }, + { + "epoch": 1.763157894736842, + "grad_norm": 1.589316964149475, + "learning_rate": 4.639988187461995e-06, + "loss": 0.1151, + "step": 201 + }, + { + "epoch": 1.7719298245614035, + "grad_norm": 
2.474756956100464, + "learning_rate": 4.633004551615851e-06, + "loss": 0.116, + "step": 202 + }, + { + "epoch": 1.780701754385965, + "grad_norm": 0.6210195422172546, + "learning_rate": 4.62595919024562e-06, + "loss": 0.1097, + "step": 203 + }, + { + "epoch": 1.7894736842105263, + "grad_norm": 0.7217905521392822, + "learning_rate": 4.618852307232078e-06, + "loss": 0.1117, + "step": 204 + }, + { + "epoch": 1.7982456140350878, + "grad_norm": 1.551251769065857, + "learning_rate": 4.611684108236334e-06, + "loss": 0.113, + "step": 205 + }, + { + "epoch": 1.807017543859649, + "grad_norm": 0.6619828939437866, + "learning_rate": 4.604454800693874e-06, + "loss": 0.113, + "step": 206 + }, + { + "epoch": 1.8157894736842106, + "grad_norm": 0.9461805820465088, + "learning_rate": 4.597164593808564e-06, + "loss": 0.1093, + "step": 207 + }, + { + "epoch": 1.8245614035087718, + "grad_norm": 1.2926547527313232, + "learning_rate": 4.589813698546592e-06, + "loss": 0.1128, + "step": 208 + }, + { + "epoch": 1.8333333333333335, + "grad_norm": 0.8754212856292725, + "learning_rate": 4.582402327630368e-06, + "loss": 0.1104, + "step": 209 + }, + { + "epoch": 1.8421052631578947, + "grad_norm": 0.846051812171936, + "learning_rate": 4.574930695532357e-06, + "loss": 0.1105, + "step": 210 + }, + { + "epoch": 1.8508771929824561, + "grad_norm": 1.3332515954971313, + "learning_rate": 4.567399018468889e-06, + "loss": 0.1101, + "step": 211 + }, + { + "epoch": 1.8596491228070176, + "grad_norm": 0.8729192614555359, + "learning_rate": 4.5598075143938855e-06, + "loss": 0.1081, + "step": 212 + }, + { + "epoch": 1.868421052631579, + "grad_norm": 0.8618345260620117, + "learning_rate": 4.552156402992567e-06, + "loss": 0.1059, + "step": 213 + }, + { + "epoch": 1.8771929824561404, + "grad_norm": 1.2135930061340332, + "learning_rate": 4.544445905675082e-06, + "loss": 0.1105, + "step": 214 + }, + { + "epoch": 1.8859649122807016, + "grad_norm": 0.8405666351318359, + "learning_rate": 4.536676245570111e-06, + "loss": 
0.1118, + "step": 215 + }, + { + "epoch": 1.8947368421052633, + "grad_norm": 0.42860639095306396, + "learning_rate": 4.528847647518403e-06, + "loss": 0.1093, + "step": 216 + }, + { + "epoch": 1.9035087719298245, + "grad_norm": 1.1538206338882446, + "learning_rate": 4.520960338066271e-06, + "loss": 0.1088, + "step": 217 + }, + { + "epoch": 1.912280701754386, + "grad_norm": 0.5870749354362488, + "learning_rate": 4.513014545459038e-06, + "loss": 0.1061, + "step": 218 + }, + { + "epoch": 1.9210526315789473, + "grad_norm": 0.7279748916625977, + "learning_rate": 4.505010499634427e-06, + "loss": 0.1032, + "step": 219 + }, + { + "epoch": 1.9298245614035088, + "grad_norm": 0.6331414580345154, + "learning_rate": 4.4969484322159125e-06, + "loss": 0.1109, + "step": 220 + }, + { + "epoch": 1.9385964912280702, + "grad_norm": 0.9024543166160583, + "learning_rate": 4.488828576506014e-06, + "loss": 0.1094, + "step": 221 + }, + { + "epoch": 1.9473684210526314, + "grad_norm": 3.540376901626587, + "learning_rate": 4.480651167479545e-06, + "loss": 0.1154, + "step": 222 + }, + { + "epoch": 1.956140350877193, + "grad_norm": 0.9506739377975464, + "learning_rate": 4.472416441776817e-06, + "loss": 0.108, + "step": 223 + }, + { + "epoch": 1.9649122807017543, + "grad_norm": 0.6585081815719604, + "learning_rate": 4.464124637696786e-06, + "loss": 0.1033, + "step": 224 + }, + { + "epoch": 1.973684210526316, + "grad_norm": 1.143038034439087, + "learning_rate": 4.455775995190161e-06, + "loss": 0.1092, + "step": 225 + }, + { + "epoch": 1.9824561403508771, + "grad_norm": 1.148261547088623, + "learning_rate": 4.4473707558524555e-06, + "loss": 0.1076, + "step": 226 + }, + { + "epoch": 1.9912280701754386, + "grad_norm": 0.7375811338424683, + "learning_rate": 4.438909162917003e-06, + "loss": 0.108, + "step": 227 + }, + { + "epoch": 2.0, + "grad_norm": 0.5254591703414917, + "learning_rate": 4.430391461247911e-06, + "loss": 0.1079, + "step": 228 + } + ], + "logging_steps": 1, + "max_steps": 684, + 
"num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 114, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.91998014829704e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-228/training_args.bin b/checkpoint-228/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..38c27bdabb0e0e68242bce9d9302628a34f6e7cf --- /dev/null +++ b/checkpoint-228/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7cb0553c2c3dd5a010aed55eae3afd8bd7f096b43ba03d25af54dc26191426ae +size 7992 diff --git a/checkpoint-228/zero_to_fp32.py b/checkpoint-228/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/checkpoint-228/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in 
state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to 
the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel 
{unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + 
print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + 
partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the 
partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + 
state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. 
you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. 
Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/checkpoint-342/README.md b/checkpoint-342/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f4a3934800eeb082a0cb833d7b6af4f68eed3615 --- /dev/null +++ b/checkpoint-342/README.md @@ -0,0 +1,202 @@ +--- +base_model: nvidia/Llama-3_3-Nemotron-Super-49B-v1 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.0 \ No newline at end of file diff --git a/checkpoint-342/adapter_config.json b/checkpoint-342/adapter_config.json new file mode 100644 index 
0000000000000000000000000000000000000000..04e5237df60f7183856cc551f942e0ea492ed0be --- /dev/null +++ b/checkpoint-342/adapter_config.json @@ -0,0 +1,42 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "nvidia/Llama-3_3-Nemotron-Super-49B-v1", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 512, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [ + "embed_tokens", + "lm_head" + ], + "peft_type": "LORA", + "r": 256, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "k_proj", + "q_proj", + "v_proj", + "down_proj", + "gate_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-342/adapter_model.safetensors b/checkpoint-342/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..71b98dd552aafec16a39e9cc46bf6dce29a76e35 --- /dev/null +++ b/checkpoint-342/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52db36c2e6cb4cc680eba88475f97c12ef838ab20fcdeb613dada0e649dacc33 +size 9016826528 diff --git a/checkpoint-342/global_step342/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-342/global_step342/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..70ef0d03cd0cb866abbc958a3726be6d37928544 --- /dev/null +++ b/checkpoint-342/global_step342/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:b9a50d0ceb927cbc242628a18b8ea961fd93acc184410f4f156e60aaa6269580 +size 27050164444 diff --git a/checkpoint-342/global_step342/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-342/global_step342/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4c163c88be363e8cbaebe6d6a0db05641f5175ca --- /dev/null +++ b/checkpoint-342/global_step342/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:017dce42a9b2359d7b2aa38ef8c0c032092cf71f07d0bf4a3759737a9cdbe71f +size 27050169884 diff --git a/checkpoint-342/global_step342/mp_rank_00_model_states.pt b/checkpoint-342/global_step342/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..040fab0a231893ace089247e1bb3f85f57e0c661 --- /dev/null +++ b/checkpoint-342/global_step342/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f2b430da1dda443608e385aaef5d7aa3763e73b3695ed2e293eacf8e7009fed +size 9776788601 diff --git a/checkpoint-342/latest b/checkpoint-342/latest new file mode 100644 index 0000000000000000000000000000000000000000..c865948ad34ed67e3b6a2d0505df96492e4bcc82 --- /dev/null +++ b/checkpoint-342/latest @@ -0,0 +1 @@ +global_step342 \ No newline at end of file diff --git a/checkpoint-342/rng_state_0.pth b/checkpoint-342/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..3c437a4fe2d46cd991229eb636f65c53484183ee --- /dev/null +++ b/checkpoint-342/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78ece40f5d1a720cb25857302767813fd74736b0b26d2e81bc81a7aad3a91d1c +size 14512 diff --git a/checkpoint-342/rng_state_1.pth b/checkpoint-342/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..d858d57424a138ea07f769788e2868d8cbd1b1e7 --- /dev/null +++ b/checkpoint-342/rng_state_1.pth @@ -0,0 +1,3 
@@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3acd28755d11d91e6050ba9d039c96c56fa63aa16b6394139525740a1c647f23 +size 14512 diff --git a/checkpoint-342/scheduler.pt b/checkpoint-342/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..8d33e9c5460dd802e989edef8d2c82a0237bcc7c --- /dev/null +++ b/checkpoint-342/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7651f6a2b549eb3de066bc2352d6529046a6983d2871e2b4d4fb602cb7961725 +size 1064 diff --git a/checkpoint-342/special_tokens_map.json b/checkpoint-342/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..278b7f0f84be865c4687700ee7b3c63d89a51e18 --- /dev/null +++ b/checkpoint-342/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-342/tokenizer.json b/checkpoint-342/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-342/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-342/tokenizer_config.json b/checkpoint-342/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..edd01b980c1db496ea102a51c972ee8f5d1a2c74 --- /dev/null +++ b/checkpoint-342/tokenizer_config.json @@ -0,0 +1,2064 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + 
"content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": 
"<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": 
false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": 
false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": 
"<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": 
"<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}{%- if messages[0]['role'] == 'system' %}{%- set system_message = messages[0]['content']|trim %}{%- set messages = messages[1:] %}{%- else %}{%- set system_message = \"\" %}{%- endif %}{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}{{- system_message }}{{- \"<|eot_id|>\" }}{%- for message in messages %}{%- if message['role'] == 'assistant' and '' in message['content'] %}{%- set content = message['content'].split('')[-1].lstrip() %}{%- else %}{%- set content = message['content'] %}{%- endif %}{{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n' + content | trim + '<|eot_id|>' }}{%- endfor %}{%- if add_generation_prompt %}{{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}{%- endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + 
"model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|end_of_text|>", + "tokenizer_class": "PreTrainedTokenizer" +} diff --git a/checkpoint-342/trainer_state.json b/checkpoint-342/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f8b4161f89c5188c42309dd8b6e2cfcb53138aa9 --- /dev/null +++ b/checkpoint-342/trainer_state.json @@ -0,0 +1,2427 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 342, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008771929824561403, + "grad_norm": 39.56407165527344, + "learning_rate": 5.0000000000000004e-08, + "loss": 5.1375, + "step": 1 + }, + { + "epoch": 0.017543859649122806, + "grad_norm": 40.30452346801758, + "learning_rate": 1.0000000000000001e-07, + "loss": 5.1185, + "step": 2 + }, + { + "epoch": 0.02631578947368421, + "grad_norm": 40.062313079833984, + "learning_rate": 1.5000000000000002e-07, + "loss": 5.0762, + "step": 3 + }, + { + "epoch": 0.03508771929824561, + "grad_norm": 39.17148208618164, + "learning_rate": 2.0000000000000002e-07, + "loss": 5.016, + "step": 4 + }, + { + "epoch": 0.043859649122807015, + "grad_norm": 40.67367172241211, + "learning_rate": 2.5000000000000004e-07, + "loss": 5.0428, + "step": 5 + }, + { + "epoch": 0.05263157894736842, + "grad_norm": 38.18095016479492, + "learning_rate": 3.0000000000000004e-07, + "loss": 5.2025, + "step": 6 + }, + { + "epoch": 0.06140350877192982, + "grad_norm": 39.12940979003906, + "learning_rate": 3.5000000000000004e-07, + "loss": 4.9896, + "step": 7 + }, + { + "epoch": 0.07017543859649122, + "grad_norm": 38.84568405151367, + "learning_rate": 4.0000000000000003e-07, + "loss": 5.1078, + "step": 8 + }, + { + "epoch": 0.07894736842105263, + "grad_norm": 39.38333511352539, + "learning_rate": 4.5000000000000003e-07, + "loss": 5.0808, + 
"step": 9 + }, + { + "epoch": 0.08771929824561403, + "grad_norm": 39.427650451660156, + "learning_rate": 5.000000000000001e-07, + "loss": 5.0534, + "step": 10 + }, + { + "epoch": 0.09649122807017543, + "grad_norm": 39.29513168334961, + "learning_rate": 5.5e-07, + "loss": 5.058, + "step": 11 + }, + { + "epoch": 0.10526315789473684, + "grad_norm": 39.641231536865234, + "learning_rate": 6.000000000000001e-07, + "loss": 5.0317, + "step": 12 + }, + { + "epoch": 0.11403508771929824, + "grad_norm": 37.91259765625, + "learning_rate": 6.5e-07, + "loss": 4.912, + "step": 13 + }, + { + "epoch": 0.12280701754385964, + "grad_norm": 38.203548431396484, + "learning_rate": 7.000000000000001e-07, + "loss": 4.9705, + "step": 14 + }, + { + "epoch": 0.13157894736842105, + "grad_norm": 39.15998840332031, + "learning_rate": 7.5e-07, + "loss": 4.6962, + "step": 15 + }, + { + "epoch": 0.14035087719298245, + "grad_norm": 37.754669189453125, + "learning_rate": 8.000000000000001e-07, + "loss": 4.6262, + "step": 16 + }, + { + "epoch": 0.14912280701754385, + "grad_norm": 35.871490478515625, + "learning_rate": 8.500000000000001e-07, + "loss": 4.5422, + "step": 17 + }, + { + "epoch": 0.15789473684210525, + "grad_norm": 36.16888427734375, + "learning_rate": 9.000000000000001e-07, + "loss": 4.664, + "step": 18 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 33.520118713378906, + "learning_rate": 9.500000000000001e-07, + "loss": 4.4697, + "step": 19 + }, + { + "epoch": 0.17543859649122806, + "grad_norm": 30.896282196044922, + "learning_rate": 1.0000000000000002e-06, + "loss": 4.3568, + "step": 20 + }, + { + "epoch": 0.18421052631578946, + "grad_norm": 29.944643020629883, + "learning_rate": 1.0500000000000001e-06, + "loss": 4.2269, + "step": 21 + }, + { + "epoch": 0.19298245614035087, + "grad_norm": 25.224485397338867, + "learning_rate": 1.1e-06, + "loss": 4.1272, + "step": 22 + }, + { + "epoch": 0.20175438596491227, + "grad_norm": 24.410480499267578, + "learning_rate": 
1.1500000000000002e-06, + "loss": 4.0585, + "step": 23 + }, + { + "epoch": 0.21052631578947367, + "grad_norm": 21.480648040771484, + "learning_rate": 1.2000000000000002e-06, + "loss": 3.9472, + "step": 24 + }, + { + "epoch": 0.21929824561403508, + "grad_norm": 20.61946678161621, + "learning_rate": 1.25e-06, + "loss": 3.8879, + "step": 25 + }, + { + "epoch": 0.22807017543859648, + "grad_norm": 19.578271865844727, + "learning_rate": 1.3e-06, + "loss": 3.6783, + "step": 26 + }, + { + "epoch": 0.23684210526315788, + "grad_norm": 17.418983459472656, + "learning_rate": 1.3500000000000002e-06, + "loss": 3.6826, + "step": 27 + }, + { + "epoch": 0.24561403508771928, + "grad_norm": 18.160301208496094, + "learning_rate": 1.4000000000000001e-06, + "loss": 3.478, + "step": 28 + }, + { + "epoch": 0.2543859649122807, + "grad_norm": 17.573204040527344, + "learning_rate": 1.45e-06, + "loss": 3.459, + "step": 29 + }, + { + "epoch": 0.2631578947368421, + "grad_norm": 17.1265869140625, + "learning_rate": 1.5e-06, + "loss": 3.3999, + "step": 30 + }, + { + "epoch": 0.2719298245614035, + "grad_norm": 15.527145385742188, + "learning_rate": 1.5500000000000002e-06, + "loss": 3.2817, + "step": 31 + }, + { + "epoch": 0.2807017543859649, + "grad_norm": 14.773847579956055, + "learning_rate": 1.6000000000000001e-06, + "loss": 3.234, + "step": 32 + }, + { + "epoch": 0.2894736842105263, + "grad_norm": 12.039301872253418, + "learning_rate": 1.6500000000000003e-06, + "loss": 3.132, + "step": 33 + }, + { + "epoch": 0.2982456140350877, + "grad_norm": 9.217979431152344, + "learning_rate": 1.7000000000000002e-06, + "loss": 3.0548, + "step": 34 + }, + { + "epoch": 0.30701754385964913, + "grad_norm": 7.575639724731445, + "learning_rate": 1.75e-06, + "loss": 2.9529, + "step": 35 + }, + { + "epoch": 0.3157894736842105, + "grad_norm": 7.496004104614258, + "learning_rate": 1.8000000000000001e-06, + "loss": 2.8967, + "step": 36 + }, + { + "epoch": 0.32456140350877194, + "grad_norm": 7.45414924621582, + 
"learning_rate": 1.85e-06, + "loss": 2.8837, + "step": 37 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 8.555658340454102, + "learning_rate": 1.9000000000000002e-06, + "loss": 2.7473, + "step": 38 + }, + { + "epoch": 0.34210526315789475, + "grad_norm": 10.03805160522461, + "learning_rate": 1.9500000000000004e-06, + "loss": 2.7355, + "step": 39 + }, + { + "epoch": 0.3508771929824561, + "grad_norm": 9.30649471282959, + "learning_rate": 2.0000000000000003e-06, + "loss": 2.6587, + "step": 40 + }, + { + "epoch": 0.35964912280701755, + "grad_norm": 8.510339736938477, + "learning_rate": 2.05e-06, + "loss": 2.5977, + "step": 41 + }, + { + "epoch": 0.3684210526315789, + "grad_norm": 4.709080696105957, + "learning_rate": 2.1000000000000002e-06, + "loss": 2.6286, + "step": 42 + }, + { + "epoch": 0.37719298245614036, + "grad_norm": 5.128961086273193, + "learning_rate": 2.15e-06, + "loss": 2.4558, + "step": 43 + }, + { + "epoch": 0.38596491228070173, + "grad_norm": 5.190136432647705, + "learning_rate": 2.2e-06, + "loss": 2.4432, + "step": 44 + }, + { + "epoch": 0.39473684210526316, + "grad_norm": 4.893551349639893, + "learning_rate": 2.25e-06, + "loss": 2.4939, + "step": 45 + }, + { + "epoch": 0.40350877192982454, + "grad_norm": 5.2434983253479, + "learning_rate": 2.3000000000000004e-06, + "loss": 2.3381, + "step": 46 + }, + { + "epoch": 0.41228070175438597, + "grad_norm": 5.122412204742432, + "learning_rate": 2.35e-06, + "loss": 2.313, + "step": 47 + }, + { + "epoch": 0.42105263157894735, + "grad_norm": 4.577274799346924, + "learning_rate": 2.4000000000000003e-06, + "loss": 2.2236, + "step": 48 + }, + { + "epoch": 0.4298245614035088, + "grad_norm": 4.722769737243652, + "learning_rate": 2.4500000000000003e-06, + "loss": 2.1987, + "step": 49 + }, + { + "epoch": 0.43859649122807015, + "grad_norm": 5.059235095977783, + "learning_rate": 2.5e-06, + "loss": 2.1415, + "step": 50 + }, + { + "epoch": 0.4473684210526316, + "grad_norm": 4.454439640045166, + "learning_rate": 
2.55e-06, + "loss": 2.0466, + "step": 51 + }, + { + "epoch": 0.45614035087719296, + "grad_norm": 4.94586706161499, + "learning_rate": 2.6e-06, + "loss": 1.8762, + "step": 52 + }, + { + "epoch": 0.4649122807017544, + "grad_norm": 4.704402446746826, + "learning_rate": 2.6500000000000005e-06, + "loss": 1.8012, + "step": 53 + }, + { + "epoch": 0.47368421052631576, + "grad_norm": 6.125903129577637, + "learning_rate": 2.7000000000000004e-06, + "loss": 1.7669, + "step": 54 + }, + { + "epoch": 0.4824561403508772, + "grad_norm": 4.5356059074401855, + "learning_rate": 2.7500000000000004e-06, + "loss": 1.6607, + "step": 55 + }, + { + "epoch": 0.49122807017543857, + "grad_norm": 6.56803035736084, + "learning_rate": 2.8000000000000003e-06, + "loss": 1.6291, + "step": 56 + }, + { + "epoch": 0.5, + "grad_norm": 4.910050392150879, + "learning_rate": 2.85e-06, + "loss": 1.5545, + "step": 57 + }, + { + "epoch": 0.5087719298245614, + "grad_norm": 8.733433723449707, + "learning_rate": 2.9e-06, + "loss": 1.4206, + "step": 58 + }, + { + "epoch": 0.5175438596491229, + "grad_norm": 8.582486152648926, + "learning_rate": 2.95e-06, + "loss": 1.3912, + "step": 59 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 13.710689544677734, + "learning_rate": 3e-06, + "loss": 1.3297, + "step": 60 + }, + { + "epoch": 0.5350877192982456, + "grad_norm": 23.400312423706055, + "learning_rate": 3.05e-06, + "loss": 1.296, + "step": 61 + }, + { + "epoch": 0.543859649122807, + "grad_norm": 5.678805351257324, + "learning_rate": 3.1000000000000004e-06, + "loss": 1.2259, + "step": 62 + }, + { + "epoch": 0.5526315789473685, + "grad_norm": 14.700899124145508, + "learning_rate": 3.1500000000000003e-06, + "loss": 1.1087, + "step": 63 + }, + { + "epoch": 0.5614035087719298, + "grad_norm": 19.38919448852539, + "learning_rate": 3.2000000000000003e-06, + "loss": 1.1805, + "step": 64 + }, + { + "epoch": 0.5701754385964912, + "grad_norm": 8.460039138793945, + "learning_rate": 3.2500000000000002e-06, + "loss": 1.0963, 
+ "step": 65 + }, + { + "epoch": 0.5789473684210527, + "grad_norm": 13.371014595031738, + "learning_rate": 3.3000000000000006e-06, + "loss": 1.0627, + "step": 66 + }, + { + "epoch": 0.5877192982456141, + "grad_norm": 22.380569458007812, + "learning_rate": 3.3500000000000005e-06, + "loss": 1.0869, + "step": 67 + }, + { + "epoch": 0.5964912280701754, + "grad_norm": 5.780513286590576, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.9991, + "step": 68 + }, + { + "epoch": 0.6052631578947368, + "grad_norm": 19.850841522216797, + "learning_rate": 3.45e-06, + "loss": 0.9683, + "step": 69 + }, + { + "epoch": 0.6140350877192983, + "grad_norm": 17.160703659057617, + "learning_rate": 3.5e-06, + "loss": 0.845, + "step": 70 + }, + { + "epoch": 0.6228070175438597, + "grad_norm": 14.264311790466309, + "learning_rate": 3.5500000000000003e-06, + "loss": 0.8059, + "step": 71 + }, + { + "epoch": 0.631578947368421, + "grad_norm": 26.39459991455078, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.85, + "step": 72 + }, + { + "epoch": 0.6403508771929824, + "grad_norm": 51.10348892211914, + "learning_rate": 3.65e-06, + "loss": 0.9755, + "step": 73 + }, + { + "epoch": 0.6491228070175439, + "grad_norm": 28.795856475830078, + "learning_rate": 3.7e-06, + "loss": 0.8966, + "step": 74 + }, + { + "epoch": 0.6578947368421053, + "grad_norm": 4.6617937088012695, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.7716, + "step": 75 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 15.729666709899902, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.7578, + "step": 76 + }, + { + "epoch": 0.6754385964912281, + "grad_norm": 7.109970569610596, + "learning_rate": 3.85e-06, + "loss": 0.7055, + "step": 77 + }, + { + "epoch": 0.6842105263157895, + "grad_norm": 20.84659194946289, + "learning_rate": 3.900000000000001e-06, + "loss": 0.7458, + "step": 78 + }, + { + "epoch": 0.6929824561403509, + "grad_norm": 21.601303100585938, + "learning_rate": 3.95e-06, + "loss": 0.6879, + "step": 
79 + }, + { + "epoch": 0.7017543859649122, + "grad_norm": 3.6914751529693604, + "learning_rate": 4.000000000000001e-06, + "loss": 0.6179, + "step": 80 + }, + { + "epoch": 0.7105263157894737, + "grad_norm": 16.539325714111328, + "learning_rate": 4.05e-06, + "loss": 0.5716, + "step": 81 + }, + { + "epoch": 0.7192982456140351, + "grad_norm": 13.931925773620605, + "learning_rate": 4.1e-06, + "loss": 0.558, + "step": 82 + }, + { + "epoch": 0.7280701754385965, + "grad_norm": 10.52951717376709, + "learning_rate": 4.15e-06, + "loss": 0.6018, + "step": 83 + }, + { + "epoch": 0.7368421052631579, + "grad_norm": 17.337060928344727, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.5501, + "step": 84 + }, + { + "epoch": 0.7456140350877193, + "grad_norm": 13.500468254089355, + "learning_rate": 4.25e-06, + "loss": 0.5214, + "step": 85 + }, + { + "epoch": 0.7543859649122807, + "grad_norm": 10.290645599365234, + "learning_rate": 4.3e-06, + "loss": 0.4996, + "step": 86 + }, + { + "epoch": 0.7631578947368421, + "grad_norm": 9.757556915283203, + "learning_rate": 4.350000000000001e-06, + "loss": 0.498, + "step": 87 + }, + { + "epoch": 0.7719298245614035, + "grad_norm": 9.325140953063965, + "learning_rate": 4.4e-06, + "loss": 0.4721, + "step": 88 + }, + { + "epoch": 0.7807017543859649, + "grad_norm": 2.9322128295898438, + "learning_rate": 4.450000000000001e-06, + "loss": 0.4528, + "step": 89 + }, + { + "epoch": 0.7894736842105263, + "grad_norm": 10.484073638916016, + "learning_rate": 4.5e-06, + "loss": 0.445, + "step": 90 + }, + { + "epoch": 0.7982456140350878, + "grad_norm": 32.7827262878418, + "learning_rate": 4.5500000000000005e-06, + "loss": 0.5105, + "step": 91 + }, + { + "epoch": 0.8070175438596491, + "grad_norm": 2.8477306365966797, + "learning_rate": 4.600000000000001e-06, + "loss": 0.4117, + "step": 92 + }, + { + "epoch": 0.8157894736842105, + "grad_norm": 2.7680225372314453, + "learning_rate": 4.65e-06, + "loss": 0.3653, + "step": 93 + }, + { + "epoch": 
0.8245614035087719, + "grad_norm": 2.6512742042541504, + "learning_rate": 4.7e-06, + "loss": 0.3878, + "step": 94 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 6.453914165496826, + "learning_rate": 4.75e-06, + "loss": 0.3611, + "step": 95 + }, + { + "epoch": 0.8421052631578947, + "grad_norm": 3.4594080448150635, + "learning_rate": 4.800000000000001e-06, + "loss": 0.3817, + "step": 96 + }, + { + "epoch": 0.8508771929824561, + "grad_norm": 3.6144917011260986, + "learning_rate": 4.85e-06, + "loss": 0.3618, + "step": 97 + }, + { + "epoch": 0.8596491228070176, + "grad_norm": 5.349407196044922, + "learning_rate": 4.9000000000000005e-06, + "loss": 0.3218, + "step": 98 + }, + { + "epoch": 0.868421052631579, + "grad_norm": 13.671236991882324, + "learning_rate": 4.95e-06, + "loss": 0.3329, + "step": 99 + }, + { + "epoch": 0.8771929824561403, + "grad_norm": 5.84046745300293, + "learning_rate": 5e-06, + "loss": 0.2967, + "step": 100 + }, + { + "epoch": 0.8859649122807017, + "grad_norm": 14.005338668823242, + "learning_rate": 4.999963827125897e-06, + "loss": 0.303, + "step": 101 + }, + { + "epoch": 0.8947368421052632, + "grad_norm": 9.18114185333252, + "learning_rate": 4.999855309550366e-06, + "loss": 0.2762, + "step": 102 + }, + { + "epoch": 0.9035087719298246, + "grad_norm": 3.0800487995147705, + "learning_rate": 4.999674450413725e-06, + "loss": 0.2628, + "step": 103 + }, + { + "epoch": 0.9122807017543859, + "grad_norm": 82.03578186035156, + "learning_rate": 4.999421254949728e-06, + "loss": 0.4065, + "step": 104 + }, + { + "epoch": 0.9210526315789473, + "grad_norm": 77.66315460205078, + "learning_rate": 4.99909573048542e-06, + "loss": 0.4307, + "step": 105 + }, + { + "epoch": 0.9298245614035088, + "grad_norm": 18.28767967224121, + "learning_rate": 4.998697886440927e-06, + "loss": 0.2571, + "step": 106 + }, + { + "epoch": 0.9385964912280702, + "grad_norm": 5.960445880889893, + "learning_rate": 4.998227734329177e-06, + "loss": 0.2847, + "step": 107 + }, + { + "epoch": 
0.9473684210526315, + "grad_norm": 5.437699794769287, + "learning_rate": 4.9976852877555755e-06, + "loss": 0.2728, + "step": 108 + }, + { + "epoch": 0.956140350877193, + "grad_norm": 3.379631280899048, + "learning_rate": 4.997070562417602e-06, + "loss": 0.2467, + "step": 109 + }, + { + "epoch": 0.9649122807017544, + "grad_norm": 3.1625075340270996, + "learning_rate": 4.996383576104362e-06, + "loss": 0.2273, + "step": 110 + }, + { + "epoch": 0.9736842105263158, + "grad_norm": 15.588600158691406, + "learning_rate": 4.995624348696071e-06, + "loss": 0.2486, + "step": 111 + }, + { + "epoch": 0.9824561403508771, + "grad_norm": 2.631044387817383, + "learning_rate": 4.9947929021634815e-06, + "loss": 0.1964, + "step": 112 + }, + { + "epoch": 0.9912280701754386, + "grad_norm": 4.706504821777344, + "learning_rate": 4.993889260567239e-06, + "loss": 0.1901, + "step": 113 + }, + { + "epoch": 1.0, + "grad_norm": 10.368465423583984, + "learning_rate": 4.9929134500571954e-06, + "loss": 0.1996, + "step": 114 + }, + { + "epoch": 1.0087719298245614, + "grad_norm": 30.44986343383789, + "learning_rate": 4.991865498871647e-06, + "loss": 0.2606, + "step": 115 + }, + { + "epoch": 1.0175438596491229, + "grad_norm": 14.421515464782715, + "learning_rate": 4.99074543733652e-06, + "loss": 0.2394, + "step": 116 + }, + { + "epoch": 1.0263157894736843, + "grad_norm": 14.072005271911621, + "learning_rate": 4.989553297864489e-06, + "loss": 0.2288, + "step": 117 + }, + { + "epoch": 1.0350877192982457, + "grad_norm": 4.395325660705566, + "learning_rate": 4.988289114954045e-06, + "loss": 0.2129, + "step": 118 + }, + { + "epoch": 1.043859649122807, + "grad_norm": 7.286703586578369, + "learning_rate": 4.986952925188489e-06, + "loss": 0.186, + "step": 119 + }, + { + "epoch": 1.0526315789473684, + "grad_norm": 8.332784652709961, + "learning_rate": 4.98554476723488e-06, + "loss": 0.178, + "step": 120 + }, + { + "epoch": 1.0614035087719298, + "grad_norm": 1.3646447658538818, + "learning_rate": 
4.984064681842917e-06, + "loss": 0.1687, + "step": 121 + }, + { + "epoch": 1.0701754385964912, + "grad_norm": 4.494940757751465, + "learning_rate": 4.982512711843753e-06, + "loss": 0.1881, + "step": 122 + }, + { + "epoch": 1.0789473684210527, + "grad_norm": 3.3929836750030518, + "learning_rate": 4.980888902148757e-06, + "loss": 0.1764, + "step": 123 + }, + { + "epoch": 1.087719298245614, + "grad_norm": 1.8281155824661255, + "learning_rate": 4.979193299748225e-06, + "loss": 0.1602, + "step": 124 + }, + { + "epoch": 1.0964912280701755, + "grad_norm": 3.494239568710327, + "learning_rate": 4.977425953710005e-06, + "loss": 0.1729, + "step": 125 + }, + { + "epoch": 1.1052631578947367, + "grad_norm": 1.500410556793213, + "learning_rate": 4.975586915178084e-06, + "loss": 0.1666, + "step": 126 + }, + { + "epoch": 1.1140350877192982, + "grad_norm": 1.4680222272872925, + "learning_rate": 4.973676237371111e-06, + "loss": 0.159, + "step": 127 + }, + { + "epoch": 1.1228070175438596, + "grad_norm": 3.0383460521698, + "learning_rate": 4.971693975580851e-06, + "loss": 0.1484, + "step": 128 + }, + { + "epoch": 1.131578947368421, + "grad_norm": 3.74821138381958, + "learning_rate": 4.969640187170591e-06, + "loss": 0.1586, + "step": 129 + }, + { + "epoch": 1.1403508771929824, + "grad_norm": 4.682602405548096, + "learning_rate": 4.967514931573473e-06, + "loss": 0.1619, + "step": 130 + }, + { + "epoch": 1.1491228070175439, + "grad_norm": 3.90673565864563, + "learning_rate": 4.965318270290779e-06, + "loss": 0.164, + "step": 131 + }, + { + "epoch": 1.1578947368421053, + "grad_norm": 2.2017388343811035, + "learning_rate": 4.963050266890152e-06, + "loss": 0.1499, + "step": 132 + }, + { + "epoch": 1.1666666666666667, + "grad_norm": 2.4211816787719727, + "learning_rate": 4.960710987003753e-06, + "loss": 0.1387, + "step": 133 + }, + { + "epoch": 1.1754385964912282, + "grad_norm": 1.7753759622573853, + "learning_rate": 4.958300498326363e-06, + "loss": 0.1441, + "step": 134 + }, + { + "epoch": 
1.1842105263157894, + "grad_norm": 1.5529910326004028, + "learning_rate": 4.955818870613425e-06, + "loss": 0.1304, + "step": 135 + }, + { + "epoch": 1.1929824561403508, + "grad_norm": 2.090593099594116, + "learning_rate": 4.953266175679023e-06, + "loss": 0.1419, + "step": 136 + }, + { + "epoch": 1.2017543859649122, + "grad_norm": 2.7141878604888916, + "learning_rate": 4.95064248739381e-06, + "loss": 0.1444, + "step": 137 + }, + { + "epoch": 1.2105263157894737, + "grad_norm": 2.3690481185913086, + "learning_rate": 4.947947881682861e-06, + "loss": 0.1383, + "step": 138 + }, + { + "epoch": 1.219298245614035, + "grad_norm": 2.2403147220611572, + "learning_rate": 4.945182436523482e-06, + "loss": 0.1418, + "step": 139 + }, + { + "epoch": 1.2280701754385965, + "grad_norm": 1.3939160108566284, + "learning_rate": 4.942346231942955e-06, + "loss": 0.1307, + "step": 140 + }, + { + "epoch": 1.236842105263158, + "grad_norm": 11.276732444763184, + "learning_rate": 4.939439350016214e-06, + "loss": 0.1397, + "step": 141 + }, + { + "epoch": 1.2456140350877192, + "grad_norm": 8.260516166687012, + "learning_rate": 4.9364618748634794e-06, + "loss": 0.1426, + "step": 142 + }, + { + "epoch": 1.2543859649122808, + "grad_norm": 2.09720516204834, + "learning_rate": 4.933413892647819e-06, + "loss": 0.1323, + "step": 143 + }, + { + "epoch": 1.263157894736842, + "grad_norm": 1.802125334739685, + "learning_rate": 4.9302954915726535e-06, + "loss": 0.1304, + "step": 144 + }, + { + "epoch": 1.2719298245614035, + "grad_norm": 1.7151471376419067, + "learning_rate": 4.927106761879207e-06, + "loss": 0.1264, + "step": 145 + }, + { + "epoch": 1.280701754385965, + "grad_norm": 1.6970336437225342, + "learning_rate": 4.923847795843894e-06, + "loss": 0.1227, + "step": 146 + }, + { + "epoch": 1.2894736842105263, + "grad_norm": 16.60441017150879, + "learning_rate": 4.920518687775647e-06, + "loss": 0.1606, + "step": 147 + }, + { + "epoch": 1.2982456140350878, + "grad_norm": 6.470354080200195, + 
"learning_rate": 4.917119534013194e-06, + "loss": 0.1447, + "step": 148 + }, + { + "epoch": 1.3070175438596492, + "grad_norm": 1.4908231496810913, + "learning_rate": 4.913650432922264e-06, + "loss": 0.1343, + "step": 149 + }, + { + "epoch": 1.3157894736842106, + "grad_norm": 3.19964861869812, + "learning_rate": 4.91011148489274e-06, + "loss": 0.1354, + "step": 150 + }, + { + "epoch": 1.3245614035087718, + "grad_norm": 2.6052839756011963, + "learning_rate": 4.906502792335761e-06, + "loss": 0.1342, + "step": 151 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 2.0719165802001953, + "learning_rate": 4.9028244596807525e-06, + "loss": 0.1359, + "step": 152 + }, + { + "epoch": 1.3421052631578947, + "grad_norm": 0.8086919784545898, + "learning_rate": 4.899076593372405e-06, + "loss": 0.1279, + "step": 153 + }, + { + "epoch": 1.3508771929824561, + "grad_norm": 1.0056848526000977, + "learning_rate": 4.8952593018675955e-06, + "loss": 0.1162, + "step": 154 + }, + { + "epoch": 1.3596491228070176, + "grad_norm": 5.72553014755249, + "learning_rate": 4.891372695632249e-06, + "loss": 0.1315, + "step": 155 + }, + { + "epoch": 1.368421052631579, + "grad_norm": 1.522894024848938, + "learning_rate": 4.887416887138139e-06, + "loss": 0.1266, + "step": 156 + }, + { + "epoch": 1.3771929824561404, + "grad_norm": 2.019472122192383, + "learning_rate": 4.883391990859635e-06, + "loss": 0.1262, + "step": 157 + }, + { + "epoch": 1.3859649122807016, + "grad_norm": 1.8594422340393066, + "learning_rate": 4.879298123270391e-06, + "loss": 0.125, + "step": 158 + }, + { + "epoch": 1.3947368421052633, + "grad_norm": 1.365377426147461, + "learning_rate": 4.8751354028399725e-06, + "loss": 0.1218, + "step": 159 + }, + { + "epoch": 1.4035087719298245, + "grad_norm": 3.553309917449951, + "learning_rate": 4.870903950030429e-06, + "loss": 0.1272, + "step": 160 + }, + { + "epoch": 1.412280701754386, + "grad_norm": 2.1770920753479004, + "learning_rate": 4.866603887292809e-06, + "loss": 0.1213, + "step": 161 
+ }, + { + "epoch": 1.4210526315789473, + "grad_norm": 1.6058955192565918, + "learning_rate": 4.862235339063613e-06, + "loss": 0.1173, + "step": 162 + }, + { + "epoch": 1.4298245614035088, + "grad_norm": 1.3208314180374146, + "learning_rate": 4.857798431761199e-06, + "loss": 0.1183, + "step": 163 + }, + { + "epoch": 1.4385964912280702, + "grad_norm": 1.282729983329773, + "learning_rate": 4.853293293782118e-06, + "loss": 0.1209, + "step": 164 + }, + { + "epoch": 1.4473684210526316, + "grad_norm": 1.3838152885437012, + "learning_rate": 4.848720055497401e-06, + "loss": 0.1198, + "step": 165 + }, + { + "epoch": 1.456140350877193, + "grad_norm": 1.2930737733840942, + "learning_rate": 4.844078849248785e-06, + "loss": 0.1268, + "step": 166 + }, + { + "epoch": 1.4649122807017543, + "grad_norm": 1.7022266387939453, + "learning_rate": 4.839369809344888e-06, + "loss": 0.1198, + "step": 167 + }, + { + "epoch": 1.4736842105263157, + "grad_norm": 1.0927815437316895, + "learning_rate": 4.834593072057313e-06, + "loss": 0.1132, + "step": 168 + }, + { + "epoch": 1.4824561403508771, + "grad_norm": 0.9326333999633789, + "learning_rate": 4.829748775616716e-06, + "loss": 0.1193, + "step": 169 + }, + { + "epoch": 1.4912280701754386, + "grad_norm": 1.3564742803573608, + "learning_rate": 4.8248370602087954e-06, + "loss": 0.118, + "step": 170 + }, + { + "epoch": 1.5, + "grad_norm": 1.19778573513031, + "learning_rate": 4.819858067970243e-06, + "loss": 0.1122, + "step": 171 + }, + { + "epoch": 1.5087719298245614, + "grad_norm": 2.8438351154327393, + "learning_rate": 4.814811942984625e-06, + "loss": 0.1217, + "step": 172 + }, + { + "epoch": 1.5175438596491229, + "grad_norm": 1.0701063871383667, + "learning_rate": 4.809698831278217e-06, + "loss": 0.1114, + "step": 173 + }, + { + "epoch": 1.526315789473684, + "grad_norm": 0.9053553938865662, + "learning_rate": 4.804518880815776e-06, + "loss": 0.1178, + "step": 174 + }, + { + "epoch": 1.5350877192982457, + "grad_norm": 0.42274603247642517, + 
"learning_rate": 4.799272241496259e-06, + "loss": 0.1091, + "step": 175 + }, + { + "epoch": 1.543859649122807, + "grad_norm": 0.8576470017433167, + "learning_rate": 4.793959065148484e-06, + "loss": 0.1134, + "step": 176 + }, + { + "epoch": 1.5526315789473686, + "grad_norm": 0.5910662412643433, + "learning_rate": 4.78857950552674e-06, + "loss": 0.1148, + "step": 177 + }, + { + "epoch": 1.5614035087719298, + "grad_norm": 0.8761632442474365, + "learning_rate": 4.783133718306331e-06, + "loss": 0.1125, + "step": 178 + }, + { + "epoch": 1.5701754385964912, + "grad_norm": 1.9190795421600342, + "learning_rate": 4.777621861079079e-06, + "loss": 0.1148, + "step": 179 + }, + { + "epoch": 1.5789473684210527, + "grad_norm": 0.6199957728385925, + "learning_rate": 4.772044093348757e-06, + "loss": 0.1097, + "step": 180 + }, + { + "epoch": 1.587719298245614, + "grad_norm": 1.562089443206787, + "learning_rate": 4.766400576526479e-06, + "loss": 0.1097, + "step": 181 + }, + { + "epoch": 1.5964912280701755, + "grad_norm": 1.4957091808319092, + "learning_rate": 4.760691473926021e-06, + "loss": 0.1216, + "step": 182 + }, + { + "epoch": 1.6052631578947367, + "grad_norm": 0.9863570332527161, + "learning_rate": 4.754916950759105e-06, + "loss": 0.1122, + "step": 183 + }, + { + "epoch": 1.6140350877192984, + "grad_norm": 0.5803346633911133, + "learning_rate": 4.749077174130609e-06, + "loss": 0.1103, + "step": 184 + }, + { + "epoch": 1.6228070175438596, + "grad_norm": 1.8789891004562378, + "learning_rate": 4.743172313033738e-06, + "loss": 0.1191, + "step": 185 + }, + { + "epoch": 1.631578947368421, + "grad_norm": 0.8731380105018616, + "learning_rate": 4.7372025383451285e-06, + "loss": 0.1154, + "step": 186 + }, + { + "epoch": 1.6403508771929824, + "grad_norm": 1.3535627126693726, + "learning_rate": 4.7311680228199075e-06, + "loss": 0.1123, + "step": 187 + }, + { + "epoch": 1.6491228070175439, + "grad_norm": 0.7211089134216309, + "learning_rate": 4.725068941086693e-06, + "loss": 0.1134, + 
"step": 188 + }, + { + "epoch": 1.6578947368421053, + "grad_norm": 1.4752328395843506, + "learning_rate": 4.718905469642534e-06, + "loss": 0.1185, + "step": 189 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.9822680354118347, + "learning_rate": 4.712677786847814e-06, + "loss": 0.1146, + "step": 190 + }, + { + "epoch": 1.6754385964912282, + "grad_norm": 1.1308330297470093, + "learning_rate": 4.706386072921083e-06, + "loss": 0.1061, + "step": 191 + }, + { + "epoch": 1.6842105263157894, + "grad_norm": 5.331939697265625, + "learning_rate": 4.70003050993384e-06, + "loss": 0.1153, + "step": 192 + }, + { + "epoch": 1.692982456140351, + "grad_norm": 0.6911673545837402, + "learning_rate": 4.6936112818052674e-06, + "loss": 0.1098, + "step": 193 + }, + { + "epoch": 1.7017543859649122, + "grad_norm": 0.5160980224609375, + "learning_rate": 4.687128574296912e-06, + "loss": 0.1073, + "step": 194 + }, + { + "epoch": 1.7105263157894737, + "grad_norm": 1.5724798440933228, + "learning_rate": 4.680582575007303e-06, + "loss": 0.121, + "step": 195 + }, + { + "epoch": 1.719298245614035, + "grad_norm": 1.3960011005401611, + "learning_rate": 4.6739734733665275e-06, + "loss": 0.1145, + "step": 196 + }, + { + "epoch": 1.7280701754385965, + "grad_norm": 1.4949183464050293, + "learning_rate": 4.6673014606307465e-06, + "loss": 0.1166, + "step": 197 + }, + { + "epoch": 1.736842105263158, + "grad_norm": 1.6873422861099243, + "learning_rate": 4.660566729876661e-06, + "loss": 0.1115, + "step": 198 + }, + { + "epoch": 1.7456140350877192, + "grad_norm": 1.3443641662597656, + "learning_rate": 4.653769475995926e-06, + "loss": 0.1119, + "step": 199 + }, + { + "epoch": 1.7543859649122808, + "grad_norm": 0.807525098323822, + "learning_rate": 4.646909895689508e-06, + "loss": 0.1059, + "step": 200 + }, + { + "epoch": 1.763157894736842, + "grad_norm": 1.589316964149475, + "learning_rate": 4.639988187461995e-06, + "loss": 0.1151, + "step": 201 + }, + { + "epoch": 1.7719298245614035, + "grad_norm": 
2.474756956100464, + "learning_rate": 4.633004551615851e-06, + "loss": 0.116, + "step": 202 + }, + { + "epoch": 1.780701754385965, + "grad_norm": 0.6210195422172546, + "learning_rate": 4.62595919024562e-06, + "loss": 0.1097, + "step": 203 + }, + { + "epoch": 1.7894736842105263, + "grad_norm": 0.7217905521392822, + "learning_rate": 4.618852307232078e-06, + "loss": 0.1117, + "step": 204 + }, + { + "epoch": 1.7982456140350878, + "grad_norm": 1.551251769065857, + "learning_rate": 4.611684108236334e-06, + "loss": 0.113, + "step": 205 + }, + { + "epoch": 1.807017543859649, + "grad_norm": 0.6619828939437866, + "learning_rate": 4.604454800693874e-06, + "loss": 0.113, + "step": 206 + }, + { + "epoch": 1.8157894736842106, + "grad_norm": 0.9461805820465088, + "learning_rate": 4.597164593808564e-06, + "loss": 0.1093, + "step": 207 + }, + { + "epoch": 1.8245614035087718, + "grad_norm": 1.2926547527313232, + "learning_rate": 4.589813698546592e-06, + "loss": 0.1128, + "step": 208 + }, + { + "epoch": 1.8333333333333335, + "grad_norm": 0.8754212856292725, + "learning_rate": 4.582402327630368e-06, + "loss": 0.1104, + "step": 209 + }, + { + "epoch": 1.8421052631578947, + "grad_norm": 0.846051812171936, + "learning_rate": 4.574930695532357e-06, + "loss": 0.1105, + "step": 210 + }, + { + "epoch": 1.8508771929824561, + "grad_norm": 1.3332515954971313, + "learning_rate": 4.567399018468889e-06, + "loss": 0.1101, + "step": 211 + }, + { + "epoch": 1.8596491228070176, + "grad_norm": 0.8729192614555359, + "learning_rate": 4.5598075143938855e-06, + "loss": 0.1081, + "step": 212 + }, + { + "epoch": 1.868421052631579, + "grad_norm": 0.8618345260620117, + "learning_rate": 4.552156402992567e-06, + "loss": 0.1059, + "step": 213 + }, + { + "epoch": 1.8771929824561404, + "grad_norm": 1.2135930061340332, + "learning_rate": 4.544445905675082e-06, + "loss": 0.1105, + "step": 214 + }, + { + "epoch": 1.8859649122807016, + "grad_norm": 0.8405666351318359, + "learning_rate": 4.536676245570111e-06, + "loss": 
0.1118, + "step": 215 + }, + { + "epoch": 1.8947368421052633, + "grad_norm": 0.42860639095306396, + "learning_rate": 4.528847647518403e-06, + "loss": 0.1093, + "step": 216 + }, + { + "epoch": 1.9035087719298245, + "grad_norm": 1.1538206338882446, + "learning_rate": 4.520960338066271e-06, + "loss": 0.1088, + "step": 217 + }, + { + "epoch": 1.912280701754386, + "grad_norm": 0.5870749354362488, + "learning_rate": 4.513014545459038e-06, + "loss": 0.1061, + "step": 218 + }, + { + "epoch": 1.9210526315789473, + "grad_norm": 0.7279748916625977, + "learning_rate": 4.505010499634427e-06, + "loss": 0.1032, + "step": 219 + }, + { + "epoch": 1.9298245614035088, + "grad_norm": 0.6331414580345154, + "learning_rate": 4.4969484322159125e-06, + "loss": 0.1109, + "step": 220 + }, + { + "epoch": 1.9385964912280702, + "grad_norm": 0.9024543166160583, + "learning_rate": 4.488828576506014e-06, + "loss": 0.1094, + "step": 221 + }, + { + "epoch": 1.9473684210526314, + "grad_norm": 3.540376901626587, + "learning_rate": 4.480651167479545e-06, + "loss": 0.1154, + "step": 222 + }, + { + "epoch": 1.956140350877193, + "grad_norm": 0.9506739377975464, + "learning_rate": 4.472416441776817e-06, + "loss": 0.108, + "step": 223 + }, + { + "epoch": 1.9649122807017543, + "grad_norm": 0.6585081815719604, + "learning_rate": 4.464124637696786e-06, + "loss": 0.1033, + "step": 224 + }, + { + "epoch": 1.973684210526316, + "grad_norm": 1.143038034439087, + "learning_rate": 4.455775995190161e-06, + "loss": 0.1092, + "step": 225 + }, + { + "epoch": 1.9824561403508771, + "grad_norm": 1.148261547088623, + "learning_rate": 4.4473707558524555e-06, + "loss": 0.1076, + "step": 226 + }, + { + "epoch": 1.9912280701754386, + "grad_norm": 0.7375811338424683, + "learning_rate": 4.438909162917003e-06, + "loss": 0.108, + "step": 227 + }, + { + "epoch": 2.0, + "grad_norm": 0.5254591703414917, + "learning_rate": 4.430391461247911e-06, + "loss": 0.1079, + "step": 228 + }, + { + "epoch": 2.008771929824561, + "grad_norm": 
1.0198495388031006, + "learning_rate": 4.42181789733298e-06, + "loss": 0.1083, + "step": 229 + }, + { + "epoch": 2.017543859649123, + "grad_norm": 0.9234157800674438, + "learning_rate": 4.413188719276569e-06, + "loss": 0.1084, + "step": 230 + }, + { + "epoch": 2.026315789473684, + "grad_norm": 0.5215068459510803, + "learning_rate": 4.404504176792414e-06, + "loss": 0.1067, + "step": 231 + }, + { + "epoch": 2.0350877192982457, + "grad_norm": 0.9296736121177673, + "learning_rate": 4.3957645211964065e-06, + "loss": 0.1066, + "step": 232 + }, + { + "epoch": 2.043859649122807, + "grad_norm": 0.8660671710968018, + "learning_rate": 4.386970005399314e-06, + "loss": 0.108, + "step": 233 + }, + { + "epoch": 2.0526315789473686, + "grad_norm": 0.6014883518218994, + "learning_rate": 4.378120883899467e-06, + "loss": 0.1068, + "step": 234 + }, + { + "epoch": 2.06140350877193, + "grad_norm": 0.6370371580123901, + "learning_rate": 4.369217412775393e-06, + "loss": 0.1076, + "step": 235 + }, + { + "epoch": 2.0701754385964914, + "grad_norm": 0.9806828498840332, + "learning_rate": 4.360259849678402e-06, + "loss": 0.1071, + "step": 236 + }, + { + "epoch": 2.0789473684210527, + "grad_norm": 0.6093440651893616, + "learning_rate": 4.351248453825137e-06, + "loss": 0.1038, + "step": 237 + }, + { + "epoch": 2.087719298245614, + "grad_norm": 1.3494842052459717, + "learning_rate": 4.3421834859900695e-06, + "loss": 0.1105, + "step": 238 + }, + { + "epoch": 2.0964912280701755, + "grad_norm": 0.7621576189994812, + "learning_rate": 4.333065208497949e-06, + "loss": 0.1048, + "step": 239 + }, + { + "epoch": 2.1052631578947367, + "grad_norm": 0.5918282866477966, + "learning_rate": 4.3238938852162195e-06, + "loss": 0.1086, + "step": 240 + }, + { + "epoch": 2.1140350877192984, + "grad_norm": 0.7048676609992981, + "learning_rate": 4.314669781547379e-06, + "loss": 0.1061, + "step": 241 + }, + { + "epoch": 2.1228070175438596, + "grad_norm": 1.0750821828842163, + "learning_rate": 4.305393164421301e-06, + 
"loss": 0.1082, + "step": 242 + }, + { + "epoch": 2.1315789473684212, + "grad_norm": 0.6171414852142334, + "learning_rate": 4.296064302287507e-06, + "loss": 0.1039, + "step": 243 + }, + { + "epoch": 2.1403508771929824, + "grad_norm": 0.8080905079841614, + "learning_rate": 4.286683465107403e-06, + "loss": 0.1069, + "step": 244 + }, + { + "epoch": 2.1491228070175437, + "grad_norm": 0.5281466245651245, + "learning_rate": 4.277250924346461e-06, + "loss": 0.1069, + "step": 245 + }, + { + "epoch": 2.1578947368421053, + "grad_norm": 0.8070254325866699, + "learning_rate": 4.267766952966369e-06, + "loss": 0.1061, + "step": 246 + }, + { + "epoch": 2.1666666666666665, + "grad_norm": 0.8560577630996704, + "learning_rate": 4.25823182541713e-06, + "loss": 0.1116, + "step": 247 + }, + { + "epoch": 2.175438596491228, + "grad_norm": 0.7772330045700073, + "learning_rate": 4.2486458176291176e-06, + "loss": 0.1092, + "step": 248 + }, + { + "epoch": 2.1842105263157894, + "grad_norm": 0.814601719379425, + "learning_rate": 4.239009207005096e-06, + "loss": 0.1093, + "step": 249 + }, + { + "epoch": 2.192982456140351, + "grad_norm": 0.957789957523346, + "learning_rate": 4.2293222724121855e-06, + "loss": 0.1075, + "step": 250 + }, + { + "epoch": 2.2017543859649122, + "grad_norm": 0.500062108039856, + "learning_rate": 4.219585294173799e-06, + "loss": 0.1048, + "step": 251 + }, + { + "epoch": 2.2105263157894735, + "grad_norm": 0.3866419792175293, + "learning_rate": 4.209798554061527e-06, + "loss": 0.1074, + "step": 252 + }, + { + "epoch": 2.219298245614035, + "grad_norm": 1.1853291988372803, + "learning_rate": 4.199962335286985e-06, + "loss": 0.1076, + "step": 253 + }, + { + "epoch": 2.2280701754385963, + "grad_norm": 0.36602887511253357, + "learning_rate": 4.1900769224936125e-06, + "loss": 0.108, + "step": 254 + }, + { + "epoch": 2.236842105263158, + "grad_norm": 0.2530711889266968, + "learning_rate": 4.180142601748447e-06, + "loss": 0.1041, + "step": 255 + }, + { + "epoch": 
2.245614035087719, + "grad_norm": 1.3067054748535156, + "learning_rate": 4.170159660533834e-06, + "loss": 0.1087, + "step": 256 + }, + { + "epoch": 2.254385964912281, + "grad_norm": 0.3442043960094452, + "learning_rate": 4.160128387739114e-06, + "loss": 0.1099, + "step": 257 + }, + { + "epoch": 2.263157894736842, + "grad_norm": 1.174796462059021, + "learning_rate": 4.150049073652262e-06, + "loss": 0.1063, + "step": 258 + }, + { + "epoch": 2.2719298245614037, + "grad_norm": 0.5719411969184875, + "learning_rate": 4.1399220099514845e-06, + "loss": 0.1043, + "step": 259 + }, + { + "epoch": 2.280701754385965, + "grad_norm": 0.7268956303596497, + "learning_rate": 4.129747489696781e-06, + "loss": 0.1038, + "step": 260 + }, + { + "epoch": 2.2894736842105265, + "grad_norm": 0.7028316259384155, + "learning_rate": 4.119525807321467e-06, + "loss": 0.1052, + "step": 261 + }, + { + "epoch": 2.2982456140350878, + "grad_norm": 1.015335202217102, + "learning_rate": 4.109257258623644e-06, + "loss": 0.1116, + "step": 262 + }, + { + "epoch": 2.307017543859649, + "grad_norm": 0.7141755819320679, + "learning_rate": 4.098942140757646e-06, + "loss": 0.108, + "step": 263 + }, + { + "epoch": 2.3157894736842106, + "grad_norm": 0.7656403183937073, + "learning_rate": 4.0885807522254435e-06, + "loss": 0.1043, + "step": 264 + }, + { + "epoch": 2.324561403508772, + "grad_norm": 0.43293774127960205, + "learning_rate": 4.078173392867998e-06, + "loss": 0.1048, + "step": 265 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 0.6755763292312622, + "learning_rate": 4.0677203638565895e-06, + "loss": 0.1064, + "step": 266 + }, + { + "epoch": 2.3421052631578947, + "grad_norm": 0.9648827314376831, + "learning_rate": 4.0572219676841e-06, + "loss": 0.1088, + "step": 267 + }, + { + "epoch": 2.3508771929824563, + "grad_norm": 0.32724836468696594, + "learning_rate": 4.046678508156259e-06, + "loss": 0.1077, + "step": 268 + }, + { + "epoch": 2.3596491228070176, + "grad_norm": 0.4696657061576843, + 
"learning_rate": 4.036090290382855e-06, + "loss": 0.1067, + "step": 269 + }, + { + "epoch": 2.3684210526315788, + "grad_norm": 0.33901306986808777, + "learning_rate": 4.025457620768901e-06, + "loss": 0.105, + "step": 270 + }, + { + "epoch": 2.3771929824561404, + "grad_norm": 0.5703794360160828, + "learning_rate": 4.014780807005775e-06, + "loss": 0.1033, + "step": 271 + }, + { + "epoch": 2.3859649122807016, + "grad_norm": 0.9639355540275574, + "learning_rate": 4.004060158062306e-06, + "loss": 0.1041, + "step": 272 + }, + { + "epoch": 2.3947368421052633, + "grad_norm": 0.8851558566093445, + "learning_rate": 3.993295984175845e-06, + "loss": 0.1064, + "step": 273 + }, + { + "epoch": 2.4035087719298245, + "grad_norm": 0.5200062990188599, + "learning_rate": 3.982488596843276e-06, + "loss": 0.1056, + "step": 274 + }, + { + "epoch": 2.412280701754386, + "grad_norm": 1.160823106765747, + "learning_rate": 3.971638308812007e-06, + "loss": 0.1069, + "step": 275 + }, + { + "epoch": 2.4210526315789473, + "grad_norm": 1.0191210508346558, + "learning_rate": 3.9607454340709215e-06, + "loss": 0.1042, + "step": 276 + }, + { + "epoch": 2.4298245614035086, + "grad_norm": 0.37181487679481506, + "learning_rate": 3.949810287841289e-06, + "loss": 0.1062, + "step": 277 + }, + { + "epoch": 2.43859649122807, + "grad_norm": 0.9328593611717224, + "learning_rate": 3.9388331865676436e-06, + "loss": 0.1086, + "step": 278 + }, + { + "epoch": 2.4473684210526314, + "grad_norm": 0.8024734258651733, + "learning_rate": 3.927814447908625e-06, + "loss": 0.1051, + "step": 279 + }, + { + "epoch": 2.456140350877193, + "grad_norm": 0.9746696352958679, + "learning_rate": 3.916754390727795e-06, + "loss": 0.1041, + "step": 280 + }, + { + "epoch": 2.4649122807017543, + "grad_norm": 0.5457844138145447, + "learning_rate": 3.905653335084394e-06, + "loss": 0.1052, + "step": 281 + }, + { + "epoch": 2.473684210526316, + "grad_norm": 1.0736924409866333, + "learning_rate": 3.8945116022240945e-06, + "loss": 0.1075, + 
"step": 282 + }, + { + "epoch": 2.482456140350877, + "grad_norm": 0.6335628032684326, + "learning_rate": 3.8833295145696964e-06, + "loss": 0.1036, + "step": 283 + }, + { + "epoch": 2.4912280701754383, + "grad_norm": 0.6909618377685547, + "learning_rate": 3.872107395711799e-06, + "loss": 0.1089, + "step": 284 + }, + { + "epoch": 2.5, + "grad_norm": 2.1871702671051025, + "learning_rate": 3.860845570399435e-06, + "loss": 0.1066, + "step": 285 + }, + { + "epoch": 2.5087719298245617, + "grad_norm": 0.5831722617149353, + "learning_rate": 3.849544364530678e-06, + "loss": 0.1055, + "step": 286 + }, + { + "epoch": 2.517543859649123, + "grad_norm": 0.5302637815475464, + "learning_rate": 3.838204105143204e-06, + "loss": 0.1057, + "step": 287 + }, + { + "epoch": 2.526315789473684, + "grad_norm": 0.6348035931587219, + "learning_rate": 3.8268251204048335e-06, + "loss": 0.1089, + "step": 288 + }, + { + "epoch": 2.5350877192982457, + "grad_norm": 2.1932008266448975, + "learning_rate": 3.815407739604033e-06, + "loss": 0.1043, + "step": 289 + }, + { + "epoch": 2.543859649122807, + "grad_norm": 0.4388940930366516, + "learning_rate": 3.803952293140385e-06, + "loss": 0.1055, + "step": 290 + }, + { + "epoch": 2.5526315789473686, + "grad_norm": 0.6853339076042175, + "learning_rate": 3.7924591125150265e-06, + "loss": 0.1036, + "step": 291 + }, + { + "epoch": 2.56140350877193, + "grad_norm": 0.34744876623153687, + "learning_rate": 3.78092853032106e-06, + "loss": 0.1025, + "step": 292 + }, + { + "epoch": 2.5701754385964914, + "grad_norm": 0.9523847699165344, + "learning_rate": 3.769360880233922e-06, + "loss": 0.1067, + "step": 293 + }, + { + "epoch": 2.5789473684210527, + "grad_norm": 1.303745985031128, + "learning_rate": 3.7577564970017338e-06, + "loss": 0.1082, + "step": 294 + }, + { + "epoch": 2.587719298245614, + "grad_norm": 0.9468981623649597, + "learning_rate": 3.7461157164356103e-06, + "loss": 0.1055, + "step": 295 + }, + { + "epoch": 2.5964912280701755, + "grad_norm": 
0.7204175591468811, + "learning_rate": 3.7344388753999434e-06, + "loss": 0.1055, + "step": 296 + }, + { + "epoch": 2.6052631578947367, + "grad_norm": 0.5110165476799011, + "learning_rate": 3.7227263118026537e-06, + "loss": 0.1092, + "step": 297 + }, + { + "epoch": 2.6140350877192984, + "grad_norm": 0.6483246088027954, + "learning_rate": 3.7109783645854116e-06, + "loss": 0.1078, + "step": 298 + }, + { + "epoch": 2.6228070175438596, + "grad_norm": 0.5058422684669495, + "learning_rate": 3.699195373713831e-06, + "loss": 0.1073, + "step": 299 + }, + { + "epoch": 2.6315789473684212, + "grad_norm": 0.4123518764972687, + "learning_rate": 3.6873776801676265e-06, + "loss": 0.1053, + "step": 300 + }, + { + "epoch": 2.6403508771929824, + "grad_norm": 1.0864709615707397, + "learning_rate": 3.675525625930751e-06, + "loss": 0.1048, + "step": 301 + }, + { + "epoch": 2.6491228070175437, + "grad_norm": 1.0264904499053955, + "learning_rate": 3.6636395539814975e-06, + "loss": 0.1059, + "step": 302 + }, + { + "epoch": 2.6578947368421053, + "grad_norm": 0.7724822163581848, + "learning_rate": 3.651719808282573e-06, + "loss": 0.1063, + "step": 303 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.7474755644798279, + "learning_rate": 3.6397667337711475e-06, + "loss": 0.1034, + "step": 304 + }, + { + "epoch": 2.675438596491228, + "grad_norm": 0.5628909468650818, + "learning_rate": 3.6277806763488666e-06, + "loss": 0.1026, + "step": 305 + }, + { + "epoch": 2.6842105263157894, + "grad_norm": 0.9070547819137573, + "learning_rate": 3.6157619828718477e-06, + "loss": 0.1031, + "step": 306 + }, + { + "epoch": 2.692982456140351, + "grad_norm": 0.6968091130256653, + "learning_rate": 3.603711001140641e-06, + "loss": 0.1068, + "step": 307 + }, + { + "epoch": 2.7017543859649122, + "grad_norm": 0.3764977753162384, + "learning_rate": 3.5916280798901604e-06, + "loss": 0.1038, + "step": 308 + }, + { + "epoch": 2.7105263157894735, + "grad_norm": 5.012625694274902, + "learning_rate": 
3.5795135687795984e-06, + "loss": 0.1129, + "step": 309 + }, + { + "epoch": 2.719298245614035, + "grad_norm": 0.6745572686195374, + "learning_rate": 3.567367818382303e-06, + "loss": 0.1071, + "step": 310 + }, + { + "epoch": 2.7280701754385968, + "grad_norm": 1.0659606456756592, + "learning_rate": 3.555191180175634e-06, + "loss": 0.1067, + "step": 311 + }, + { + "epoch": 2.736842105263158, + "grad_norm": 1.7312604188919067, + "learning_rate": 3.5429840065307924e-06, + "loss": 0.1101, + "step": 312 + }, + { + "epoch": 2.745614035087719, + "grad_norm": 1.100364327430725, + "learning_rate": 3.5307466507026223e-06, + "loss": 0.1098, + "step": 313 + }, + { + "epoch": 2.754385964912281, + "grad_norm": 1.0390428304672241, + "learning_rate": 3.5184794668193893e-06, + "loss": 0.1094, + "step": 314 + }, + { + "epoch": 2.763157894736842, + "grad_norm": 0.3369971811771393, + "learning_rate": 3.5061828098725327e-06, + "loss": 0.1053, + "step": 315 + }, + { + "epoch": 2.7719298245614032, + "grad_norm": 0.6130257248878479, + "learning_rate": 3.4938570357063906e-06, + "loss": 0.106, + "step": 316 + }, + { + "epoch": 2.780701754385965, + "grad_norm": 0.6387595534324646, + "learning_rate": 3.481502501007904e-06, + "loss": 0.1044, + "step": 317 + }, + { + "epoch": 2.7894736842105265, + "grad_norm": 1.0731587409973145, + "learning_rate": 3.469119563296296e-06, + "loss": 0.1097, + "step": 318 + }, + { + "epoch": 2.7982456140350878, + "grad_norm": 0.8096229434013367, + "learning_rate": 3.4567085809127247e-06, + "loss": 0.1076, + "step": 319 + }, + { + "epoch": 2.807017543859649, + "grad_norm": 0.5034844279289246, + "learning_rate": 3.444269913009912e-06, + "loss": 0.1071, + "step": 320 + }, + { + "epoch": 2.8157894736842106, + "grad_norm": 0.675139307975769, + "learning_rate": 3.4318039195417536e-06, + "loss": 0.1039, + "step": 321 + }, + { + "epoch": 2.824561403508772, + "grad_norm": 0.7330355644226074, + "learning_rate": 3.4193109612528972e-06, + "loss": 0.1044, + "step": 322 + }, + { 
+ "epoch": 2.8333333333333335, + "grad_norm": 0.6558271646499634, + "learning_rate": 3.4067913996683115e-06, + "loss": 0.1051, + "step": 323 + }, + { + "epoch": 2.8421052631578947, + "grad_norm": 0.8411844372749329, + "learning_rate": 3.3942455970828146e-06, + "loss": 0.1063, + "step": 324 + }, + { + "epoch": 2.8508771929824563, + "grad_norm": 0.4817325174808502, + "learning_rate": 3.3816739165505964e-06, + "loss": 0.105, + "step": 325 + }, + { + "epoch": 2.8596491228070176, + "grad_norm": 0.424554705619812, + "learning_rate": 3.3690767218747104e-06, + "loss": 0.1037, + "step": 326 + }, + { + "epoch": 2.8684210526315788, + "grad_norm": 1.0054417848587036, + "learning_rate": 3.3564543775965475e-06, + "loss": 0.1058, + "step": 327 + }, + { + "epoch": 2.8771929824561404, + "grad_norm": 0.8984584808349609, + "learning_rate": 3.3438072489852837e-06, + "loss": 0.1079, + "step": 328 + }, + { + "epoch": 2.8859649122807016, + "grad_norm": 0.6779558062553406, + "learning_rate": 3.331135702027311e-06, + "loss": 0.1046, + "step": 329 + }, + { + "epoch": 2.8947368421052633, + "grad_norm": 0.6931657195091248, + "learning_rate": 3.318440103415649e-06, + "loss": 0.1106, + "step": 330 + }, + { + "epoch": 2.9035087719298245, + "grad_norm": 0.705264151096344, + "learning_rate": 3.305720820539329e-06, + "loss": 0.104, + "step": 331 + }, + { + "epoch": 2.912280701754386, + "grad_norm": 0.7799407839775085, + "learning_rate": 3.2929782214727657e-06, + "loss": 0.1019, + "step": 332 + }, + { + "epoch": 2.9210526315789473, + "grad_norm": 0.7583760619163513, + "learning_rate": 3.2802126749651042e-06, + "loss": 0.1049, + "step": 333 + }, + { + "epoch": 2.9298245614035086, + "grad_norm": 0.6145837306976318, + "learning_rate": 3.2674245504295505e-06, + "loss": 0.104, + "step": 334 + }, + { + "epoch": 2.93859649122807, + "grad_norm": 0.5170779228210449, + "learning_rate": 3.254614217932679e-06, + "loss": 0.1024, + "step": 335 + }, + { + "epoch": 2.9473684210526314, + "grad_norm": 
0.6850940585136414, + "learning_rate": 3.241782048183726e-06, + "loss": 0.1047, + "step": 336 + }, + { + "epoch": 2.956140350877193, + "grad_norm": 0.7307694554328918, + "learning_rate": 3.2289284125238597e-06, + "loss": 0.1032, + "step": 337 + }, + { + "epoch": 2.9649122807017543, + "grad_norm": 0.3386179208755493, + "learning_rate": 3.216053682915436e-06, + "loss": 0.1037, + "step": 338 + }, + { + "epoch": 2.973684210526316, + "grad_norm": 0.7565059065818787, + "learning_rate": 3.203158231931234e-06, + "loss": 0.1048, + "step": 339 + }, + { + "epoch": 2.982456140350877, + "grad_norm": 0.7902039289474487, + "learning_rate": 3.190242432743673e-06, + "loss": 0.1068, + "step": 340 + }, + { + "epoch": 2.9912280701754383, + "grad_norm": 0.42595192790031433, + "learning_rate": 3.177306659114015e-06, + "loss": 0.1039, + "step": 341 + }, + { + "epoch": 3.0, + "grad_norm": 1.1214542388916016, + "learning_rate": 3.164351285381549e-06, + "loss": 0.1062, + "step": 342 + } + ], + "logging_steps": 1, + "max_steps": 684, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 114, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.037997022244556e+19, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-342/training_args.bin b/checkpoint-342/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..38c27bdabb0e0e68242bce9d9302628a34f6e7cf --- /dev/null +++ b/checkpoint-342/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7cb0553c2c3dd5a010aed55eae3afd8bd7f096b43ba03d25af54dc26191426ae +size 7992 diff --git a/checkpoint-342/zero_to_fp32.py b/checkpoint-342/zero_to_fp32.py new file mode 100644 index 
0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/checkpoint-342/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") 
+ + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in 
state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, 
zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + 
full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + 
wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # 
recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. 
+ + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. 
(one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/checkpoint-456/README.md b/checkpoint-456/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f4a3934800eeb082a0cb833d7b6af4f68eed3615 --- /dev/null +++ b/checkpoint-456/README.md @@ -0,0 +1,202 @@ +--- +base_model: nvidia/Llama-3_3-Nemotron-Super-49B-v1 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.0 \ No newline at end of file diff --git a/checkpoint-456/adapter_config.json b/checkpoint-456/adapter_config.json new file mode 100644 index 
0000000000000000000000000000000000000000..04e5237df60f7183856cc551f942e0ea492ed0be --- /dev/null +++ b/checkpoint-456/adapter_config.json @@ -0,0 +1,42 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "nvidia/Llama-3_3-Nemotron-Super-49B-v1", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 512, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [ + "embed_tokens", + "lm_head" + ], + "peft_type": "LORA", + "r": 256, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "k_proj", + "q_proj", + "v_proj", + "down_proj", + "gate_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-456/adapter_model.safetensors b/checkpoint-456/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..42a164c19b8991d795745b2be7f51614f9e1c94c --- /dev/null +++ b/checkpoint-456/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dccdb7039918f5ffc78444d0e12eeaba609108a1ca06b93d76a6d876e6261bed +size 9016826528 diff --git a/checkpoint-456/global_step456/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-456/global_step456/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..28c0b48334ed509e21364c83067d53c9aa7f48a3 --- /dev/null +++ b/checkpoint-456/global_step456/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:ef6d7f29aa4de04e9a43a18bc91e2b805f98f74bd58a41ad903e02e7d0892d90 +size 27050164444 diff --git a/checkpoint-456/global_step456/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-456/global_step456/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..68ac964fceeee099e6785a218a177277061604a6 --- /dev/null +++ b/checkpoint-456/global_step456/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da3b0512235d477b9bbfecacb366aafb4262703f193d00e52ab75dd8d8d57866 +size 27050169884 diff --git a/checkpoint-456/global_step456/mp_rank_00_model_states.pt b/checkpoint-456/global_step456/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ca0ec2806350c36f92e6af618cec77020e3329b3 --- /dev/null +++ b/checkpoint-456/global_step456/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d25d7823729e2c703bfc8a32d1be4f363a6cf4448ba82fa6c7c3fed7bc41780 +size 9776788601 diff --git a/checkpoint-456/latest b/checkpoint-456/latest new file mode 100644 index 0000000000000000000000000000000000000000..dbd5ff49aa710762c49b97ba3da2fe7861cf8ba3 --- /dev/null +++ b/checkpoint-456/latest @@ -0,0 +1 @@ +global_step456 \ No newline at end of file diff --git a/checkpoint-456/rng_state_0.pth b/checkpoint-456/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..ae0b1a9decf4ec9bb35071035fb26c2d4c93b67e --- /dev/null +++ b/checkpoint-456/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:558cb392136e9a34f3dc978e709dbf7e921016d196633280baa3af2f9b835feb +size 14512 diff --git a/checkpoint-456/rng_state_1.pth b/checkpoint-456/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..47cbe05f6d15e006bbe6a3733bfe0cfdc100ba87 --- /dev/null +++ b/checkpoint-456/rng_state_1.pth @@ -0,0 +1,3 
@@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c56add9c7ad678528b4506397c292f971d6ba5e1526ee57775b0f10a018460b +size 14512 diff --git a/checkpoint-456/scheduler.pt b/checkpoint-456/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..9b1c0ffb3e608b05301b650729d1fd00684fe1c8 --- /dev/null +++ b/checkpoint-456/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb3bad9ba2764b50552b04ba92b9b14a087b1672d360cfafb090b7313e46de9c +size 1064 diff --git a/checkpoint-456/special_tokens_map.json b/checkpoint-456/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..278b7f0f84be865c4687700ee7b3c63d89a51e18 --- /dev/null +++ b/checkpoint-456/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-456/tokenizer.json b/checkpoint-456/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-456/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-456/tokenizer_config.json b/checkpoint-456/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..edd01b980c1db496ea102a51c972ee8f5d1a2c74 --- /dev/null +++ b/checkpoint-456/tokenizer_config.json @@ -0,0 +1,2064 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + 
"content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": 
"<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": 
false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": 
false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": 
"<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": 
"<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}{%- if messages[0]['role'] == 'system' %}{%- set system_message = messages[0]['content']|trim %}{%- set messages = messages[1:] %}{%- else %}{%- set system_message = \"\" %}{%- endif %}{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}{{- system_message }}{{- \"<|eot_id|>\" }}{%- for message in messages %}{%- if message['role'] == 'assistant' and '' in message['content'] %}{%- set content = message['content'].split('')[-1].lstrip() %}{%- else %}{%- set content = message['content'] %}{%- endif %}{{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n' + content | trim + '<|eot_id|>' }}{%- endfor %}{%- if add_generation_prompt %}{{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}{%- endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + 
"model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|end_of_text|>", + "tokenizer_class": "PreTrainedTokenizer" +} diff --git a/checkpoint-456/trainer_state.json b/checkpoint-456/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..52bcc74e2542bd0952c8ea0398287f8bb21489d7 --- /dev/null +++ b/checkpoint-456/trainer_state.json @@ -0,0 +1,3225 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.0, + "eval_steps": 500, + "global_step": 456, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008771929824561403, + "grad_norm": 39.56407165527344, + "learning_rate": 5.0000000000000004e-08, + "loss": 5.1375, + "step": 1 + }, + { + "epoch": 0.017543859649122806, + "grad_norm": 40.30452346801758, + "learning_rate": 1.0000000000000001e-07, + "loss": 5.1185, + "step": 2 + }, + { + "epoch": 0.02631578947368421, + "grad_norm": 40.062313079833984, + "learning_rate": 1.5000000000000002e-07, + "loss": 5.0762, + "step": 3 + }, + { + "epoch": 0.03508771929824561, + "grad_norm": 39.17148208618164, + "learning_rate": 2.0000000000000002e-07, + "loss": 5.016, + "step": 4 + }, + { + "epoch": 0.043859649122807015, + "grad_norm": 40.67367172241211, + "learning_rate": 2.5000000000000004e-07, + "loss": 5.0428, + "step": 5 + }, + { + "epoch": 0.05263157894736842, + "grad_norm": 38.18095016479492, + "learning_rate": 3.0000000000000004e-07, + "loss": 5.2025, + "step": 6 + }, + { + "epoch": 0.06140350877192982, + "grad_norm": 39.12940979003906, + "learning_rate": 3.5000000000000004e-07, + "loss": 4.9896, + "step": 7 + }, + { + "epoch": 0.07017543859649122, + "grad_norm": 38.84568405151367, + "learning_rate": 4.0000000000000003e-07, + "loss": 5.1078, + "step": 8 + }, + { + "epoch": 0.07894736842105263, + "grad_norm": 39.38333511352539, + "learning_rate": 4.5000000000000003e-07, + "loss": 5.0808, + 
"step": 9 + }, + { + "epoch": 0.08771929824561403, + "grad_norm": 39.427650451660156, + "learning_rate": 5.000000000000001e-07, + "loss": 5.0534, + "step": 10 + }, + { + "epoch": 0.09649122807017543, + "grad_norm": 39.29513168334961, + "learning_rate": 5.5e-07, + "loss": 5.058, + "step": 11 + }, + { + "epoch": 0.10526315789473684, + "grad_norm": 39.641231536865234, + "learning_rate": 6.000000000000001e-07, + "loss": 5.0317, + "step": 12 + }, + { + "epoch": 0.11403508771929824, + "grad_norm": 37.91259765625, + "learning_rate": 6.5e-07, + "loss": 4.912, + "step": 13 + }, + { + "epoch": 0.12280701754385964, + "grad_norm": 38.203548431396484, + "learning_rate": 7.000000000000001e-07, + "loss": 4.9705, + "step": 14 + }, + { + "epoch": 0.13157894736842105, + "grad_norm": 39.15998840332031, + "learning_rate": 7.5e-07, + "loss": 4.6962, + "step": 15 + }, + { + "epoch": 0.14035087719298245, + "grad_norm": 37.754669189453125, + "learning_rate": 8.000000000000001e-07, + "loss": 4.6262, + "step": 16 + }, + { + "epoch": 0.14912280701754385, + "grad_norm": 35.871490478515625, + "learning_rate": 8.500000000000001e-07, + "loss": 4.5422, + "step": 17 + }, + { + "epoch": 0.15789473684210525, + "grad_norm": 36.16888427734375, + "learning_rate": 9.000000000000001e-07, + "loss": 4.664, + "step": 18 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 33.520118713378906, + "learning_rate": 9.500000000000001e-07, + "loss": 4.4697, + "step": 19 + }, + { + "epoch": 0.17543859649122806, + "grad_norm": 30.896282196044922, + "learning_rate": 1.0000000000000002e-06, + "loss": 4.3568, + "step": 20 + }, + { + "epoch": 0.18421052631578946, + "grad_norm": 29.944643020629883, + "learning_rate": 1.0500000000000001e-06, + "loss": 4.2269, + "step": 21 + }, + { + "epoch": 0.19298245614035087, + "grad_norm": 25.224485397338867, + "learning_rate": 1.1e-06, + "loss": 4.1272, + "step": 22 + }, + { + "epoch": 0.20175438596491227, + "grad_norm": 24.410480499267578, + "learning_rate": 
1.1500000000000002e-06, + "loss": 4.0585, + "step": 23 + }, + { + "epoch": 0.21052631578947367, + "grad_norm": 21.480648040771484, + "learning_rate": 1.2000000000000002e-06, + "loss": 3.9472, + "step": 24 + }, + { + "epoch": 0.21929824561403508, + "grad_norm": 20.61946678161621, + "learning_rate": 1.25e-06, + "loss": 3.8879, + "step": 25 + }, + { + "epoch": 0.22807017543859648, + "grad_norm": 19.578271865844727, + "learning_rate": 1.3e-06, + "loss": 3.6783, + "step": 26 + }, + { + "epoch": 0.23684210526315788, + "grad_norm": 17.418983459472656, + "learning_rate": 1.3500000000000002e-06, + "loss": 3.6826, + "step": 27 + }, + { + "epoch": 0.24561403508771928, + "grad_norm": 18.160301208496094, + "learning_rate": 1.4000000000000001e-06, + "loss": 3.478, + "step": 28 + }, + { + "epoch": 0.2543859649122807, + "grad_norm": 17.573204040527344, + "learning_rate": 1.45e-06, + "loss": 3.459, + "step": 29 + }, + { + "epoch": 0.2631578947368421, + "grad_norm": 17.1265869140625, + "learning_rate": 1.5e-06, + "loss": 3.3999, + "step": 30 + }, + { + "epoch": 0.2719298245614035, + "grad_norm": 15.527145385742188, + "learning_rate": 1.5500000000000002e-06, + "loss": 3.2817, + "step": 31 + }, + { + "epoch": 0.2807017543859649, + "grad_norm": 14.773847579956055, + "learning_rate": 1.6000000000000001e-06, + "loss": 3.234, + "step": 32 + }, + { + "epoch": 0.2894736842105263, + "grad_norm": 12.039301872253418, + "learning_rate": 1.6500000000000003e-06, + "loss": 3.132, + "step": 33 + }, + { + "epoch": 0.2982456140350877, + "grad_norm": 9.217979431152344, + "learning_rate": 1.7000000000000002e-06, + "loss": 3.0548, + "step": 34 + }, + { + "epoch": 0.30701754385964913, + "grad_norm": 7.575639724731445, + "learning_rate": 1.75e-06, + "loss": 2.9529, + "step": 35 + }, + { + "epoch": 0.3157894736842105, + "grad_norm": 7.496004104614258, + "learning_rate": 1.8000000000000001e-06, + "loss": 2.8967, + "step": 36 + }, + { + "epoch": 0.32456140350877194, + "grad_norm": 7.45414924621582, + 
"learning_rate": 1.85e-06, + "loss": 2.8837, + "step": 37 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 8.555658340454102, + "learning_rate": 1.9000000000000002e-06, + "loss": 2.7473, + "step": 38 + }, + { + "epoch": 0.34210526315789475, + "grad_norm": 10.03805160522461, + "learning_rate": 1.9500000000000004e-06, + "loss": 2.7355, + "step": 39 + }, + { + "epoch": 0.3508771929824561, + "grad_norm": 9.30649471282959, + "learning_rate": 2.0000000000000003e-06, + "loss": 2.6587, + "step": 40 + }, + { + "epoch": 0.35964912280701755, + "grad_norm": 8.510339736938477, + "learning_rate": 2.05e-06, + "loss": 2.5977, + "step": 41 + }, + { + "epoch": 0.3684210526315789, + "grad_norm": 4.709080696105957, + "learning_rate": 2.1000000000000002e-06, + "loss": 2.6286, + "step": 42 + }, + { + "epoch": 0.37719298245614036, + "grad_norm": 5.128961086273193, + "learning_rate": 2.15e-06, + "loss": 2.4558, + "step": 43 + }, + { + "epoch": 0.38596491228070173, + "grad_norm": 5.190136432647705, + "learning_rate": 2.2e-06, + "loss": 2.4432, + "step": 44 + }, + { + "epoch": 0.39473684210526316, + "grad_norm": 4.893551349639893, + "learning_rate": 2.25e-06, + "loss": 2.4939, + "step": 45 + }, + { + "epoch": 0.40350877192982454, + "grad_norm": 5.2434983253479, + "learning_rate": 2.3000000000000004e-06, + "loss": 2.3381, + "step": 46 + }, + { + "epoch": 0.41228070175438597, + "grad_norm": 5.122412204742432, + "learning_rate": 2.35e-06, + "loss": 2.313, + "step": 47 + }, + { + "epoch": 0.42105263157894735, + "grad_norm": 4.577274799346924, + "learning_rate": 2.4000000000000003e-06, + "loss": 2.2236, + "step": 48 + }, + { + "epoch": 0.4298245614035088, + "grad_norm": 4.722769737243652, + "learning_rate": 2.4500000000000003e-06, + "loss": 2.1987, + "step": 49 + }, + { + "epoch": 0.43859649122807015, + "grad_norm": 5.059235095977783, + "learning_rate": 2.5e-06, + "loss": 2.1415, + "step": 50 + }, + { + "epoch": 0.4473684210526316, + "grad_norm": 4.454439640045166, + "learning_rate": 
2.55e-06, + "loss": 2.0466, + "step": 51 + }, + { + "epoch": 0.45614035087719296, + "grad_norm": 4.94586706161499, + "learning_rate": 2.6e-06, + "loss": 1.8762, + "step": 52 + }, + { + "epoch": 0.4649122807017544, + "grad_norm": 4.704402446746826, + "learning_rate": 2.6500000000000005e-06, + "loss": 1.8012, + "step": 53 + }, + { + "epoch": 0.47368421052631576, + "grad_norm": 6.125903129577637, + "learning_rate": 2.7000000000000004e-06, + "loss": 1.7669, + "step": 54 + }, + { + "epoch": 0.4824561403508772, + "grad_norm": 4.5356059074401855, + "learning_rate": 2.7500000000000004e-06, + "loss": 1.6607, + "step": 55 + }, + { + "epoch": 0.49122807017543857, + "grad_norm": 6.56803035736084, + "learning_rate": 2.8000000000000003e-06, + "loss": 1.6291, + "step": 56 + }, + { + "epoch": 0.5, + "grad_norm": 4.910050392150879, + "learning_rate": 2.85e-06, + "loss": 1.5545, + "step": 57 + }, + { + "epoch": 0.5087719298245614, + "grad_norm": 8.733433723449707, + "learning_rate": 2.9e-06, + "loss": 1.4206, + "step": 58 + }, + { + "epoch": 0.5175438596491229, + "grad_norm": 8.582486152648926, + "learning_rate": 2.95e-06, + "loss": 1.3912, + "step": 59 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 13.710689544677734, + "learning_rate": 3e-06, + "loss": 1.3297, + "step": 60 + }, + { + "epoch": 0.5350877192982456, + "grad_norm": 23.400312423706055, + "learning_rate": 3.05e-06, + "loss": 1.296, + "step": 61 + }, + { + "epoch": 0.543859649122807, + "grad_norm": 5.678805351257324, + "learning_rate": 3.1000000000000004e-06, + "loss": 1.2259, + "step": 62 + }, + { + "epoch": 0.5526315789473685, + "grad_norm": 14.700899124145508, + "learning_rate": 3.1500000000000003e-06, + "loss": 1.1087, + "step": 63 + }, + { + "epoch": 0.5614035087719298, + "grad_norm": 19.38919448852539, + "learning_rate": 3.2000000000000003e-06, + "loss": 1.1805, + "step": 64 + }, + { + "epoch": 0.5701754385964912, + "grad_norm": 8.460039138793945, + "learning_rate": 3.2500000000000002e-06, + "loss": 1.0963, 
+ "step": 65 + }, + { + "epoch": 0.5789473684210527, + "grad_norm": 13.371014595031738, + "learning_rate": 3.3000000000000006e-06, + "loss": 1.0627, + "step": 66 + }, + { + "epoch": 0.5877192982456141, + "grad_norm": 22.380569458007812, + "learning_rate": 3.3500000000000005e-06, + "loss": 1.0869, + "step": 67 + }, + { + "epoch": 0.5964912280701754, + "grad_norm": 5.780513286590576, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.9991, + "step": 68 + }, + { + "epoch": 0.6052631578947368, + "grad_norm": 19.850841522216797, + "learning_rate": 3.45e-06, + "loss": 0.9683, + "step": 69 + }, + { + "epoch": 0.6140350877192983, + "grad_norm": 17.160703659057617, + "learning_rate": 3.5e-06, + "loss": 0.845, + "step": 70 + }, + { + "epoch": 0.6228070175438597, + "grad_norm": 14.264311790466309, + "learning_rate": 3.5500000000000003e-06, + "loss": 0.8059, + "step": 71 + }, + { + "epoch": 0.631578947368421, + "grad_norm": 26.39459991455078, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.85, + "step": 72 + }, + { + "epoch": 0.6403508771929824, + "grad_norm": 51.10348892211914, + "learning_rate": 3.65e-06, + "loss": 0.9755, + "step": 73 + }, + { + "epoch": 0.6491228070175439, + "grad_norm": 28.795856475830078, + "learning_rate": 3.7e-06, + "loss": 0.8966, + "step": 74 + }, + { + "epoch": 0.6578947368421053, + "grad_norm": 4.6617937088012695, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.7716, + "step": 75 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 15.729666709899902, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.7578, + "step": 76 + }, + { + "epoch": 0.6754385964912281, + "grad_norm": 7.109970569610596, + "learning_rate": 3.85e-06, + "loss": 0.7055, + "step": 77 + }, + { + "epoch": 0.6842105263157895, + "grad_norm": 20.84659194946289, + "learning_rate": 3.900000000000001e-06, + "loss": 0.7458, + "step": 78 + }, + { + "epoch": 0.6929824561403509, + "grad_norm": 21.601303100585938, + "learning_rate": 3.95e-06, + "loss": 0.6879, + "step": 
79 + }, + { + "epoch": 0.7017543859649122, + "grad_norm": 3.6914751529693604, + "learning_rate": 4.000000000000001e-06, + "loss": 0.6179, + "step": 80 + }, + { + "epoch": 0.7105263157894737, + "grad_norm": 16.539325714111328, + "learning_rate": 4.05e-06, + "loss": 0.5716, + "step": 81 + }, + { + "epoch": 0.7192982456140351, + "grad_norm": 13.931925773620605, + "learning_rate": 4.1e-06, + "loss": 0.558, + "step": 82 + }, + { + "epoch": 0.7280701754385965, + "grad_norm": 10.52951717376709, + "learning_rate": 4.15e-06, + "loss": 0.6018, + "step": 83 + }, + { + "epoch": 0.7368421052631579, + "grad_norm": 17.337060928344727, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.5501, + "step": 84 + }, + { + "epoch": 0.7456140350877193, + "grad_norm": 13.500468254089355, + "learning_rate": 4.25e-06, + "loss": 0.5214, + "step": 85 + }, + { + "epoch": 0.7543859649122807, + "grad_norm": 10.290645599365234, + "learning_rate": 4.3e-06, + "loss": 0.4996, + "step": 86 + }, + { + "epoch": 0.7631578947368421, + "grad_norm": 9.757556915283203, + "learning_rate": 4.350000000000001e-06, + "loss": 0.498, + "step": 87 + }, + { + "epoch": 0.7719298245614035, + "grad_norm": 9.325140953063965, + "learning_rate": 4.4e-06, + "loss": 0.4721, + "step": 88 + }, + { + "epoch": 0.7807017543859649, + "grad_norm": 2.9322128295898438, + "learning_rate": 4.450000000000001e-06, + "loss": 0.4528, + "step": 89 + }, + { + "epoch": 0.7894736842105263, + "grad_norm": 10.484073638916016, + "learning_rate": 4.5e-06, + "loss": 0.445, + "step": 90 + }, + { + "epoch": 0.7982456140350878, + "grad_norm": 32.7827262878418, + "learning_rate": 4.5500000000000005e-06, + "loss": 0.5105, + "step": 91 + }, + { + "epoch": 0.8070175438596491, + "grad_norm": 2.8477306365966797, + "learning_rate": 4.600000000000001e-06, + "loss": 0.4117, + "step": 92 + }, + { + "epoch": 0.8157894736842105, + "grad_norm": 2.7680225372314453, + "learning_rate": 4.65e-06, + "loss": 0.3653, + "step": 93 + }, + { + "epoch": 
0.8245614035087719, + "grad_norm": 2.6512742042541504, + "learning_rate": 4.7e-06, + "loss": 0.3878, + "step": 94 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 6.453914165496826, + "learning_rate": 4.75e-06, + "loss": 0.3611, + "step": 95 + }, + { + "epoch": 0.8421052631578947, + "grad_norm": 3.4594080448150635, + "learning_rate": 4.800000000000001e-06, + "loss": 0.3817, + "step": 96 + }, + { + "epoch": 0.8508771929824561, + "grad_norm": 3.6144917011260986, + "learning_rate": 4.85e-06, + "loss": 0.3618, + "step": 97 + }, + { + "epoch": 0.8596491228070176, + "grad_norm": 5.349407196044922, + "learning_rate": 4.9000000000000005e-06, + "loss": 0.3218, + "step": 98 + }, + { + "epoch": 0.868421052631579, + "grad_norm": 13.671236991882324, + "learning_rate": 4.95e-06, + "loss": 0.3329, + "step": 99 + }, + { + "epoch": 0.8771929824561403, + "grad_norm": 5.84046745300293, + "learning_rate": 5e-06, + "loss": 0.2967, + "step": 100 + }, + { + "epoch": 0.8859649122807017, + "grad_norm": 14.005338668823242, + "learning_rate": 4.999963827125897e-06, + "loss": 0.303, + "step": 101 + }, + { + "epoch": 0.8947368421052632, + "grad_norm": 9.18114185333252, + "learning_rate": 4.999855309550366e-06, + "loss": 0.2762, + "step": 102 + }, + { + "epoch": 0.9035087719298246, + "grad_norm": 3.0800487995147705, + "learning_rate": 4.999674450413725e-06, + "loss": 0.2628, + "step": 103 + }, + { + "epoch": 0.9122807017543859, + "grad_norm": 82.03578186035156, + "learning_rate": 4.999421254949728e-06, + "loss": 0.4065, + "step": 104 + }, + { + "epoch": 0.9210526315789473, + "grad_norm": 77.66315460205078, + "learning_rate": 4.99909573048542e-06, + "loss": 0.4307, + "step": 105 + }, + { + "epoch": 0.9298245614035088, + "grad_norm": 18.28767967224121, + "learning_rate": 4.998697886440927e-06, + "loss": 0.2571, + "step": 106 + }, + { + "epoch": 0.9385964912280702, + "grad_norm": 5.960445880889893, + "learning_rate": 4.998227734329177e-06, + "loss": 0.2847, + "step": 107 + }, + { + "epoch": 
0.9473684210526315, + "grad_norm": 5.437699794769287, + "learning_rate": 4.9976852877555755e-06, + "loss": 0.2728, + "step": 108 + }, + { + "epoch": 0.956140350877193, + "grad_norm": 3.379631280899048, + "learning_rate": 4.997070562417602e-06, + "loss": 0.2467, + "step": 109 + }, + { + "epoch": 0.9649122807017544, + "grad_norm": 3.1625075340270996, + "learning_rate": 4.996383576104362e-06, + "loss": 0.2273, + "step": 110 + }, + { + "epoch": 0.9736842105263158, + "grad_norm": 15.588600158691406, + "learning_rate": 4.995624348696071e-06, + "loss": 0.2486, + "step": 111 + }, + { + "epoch": 0.9824561403508771, + "grad_norm": 2.631044387817383, + "learning_rate": 4.9947929021634815e-06, + "loss": 0.1964, + "step": 112 + }, + { + "epoch": 0.9912280701754386, + "grad_norm": 4.706504821777344, + "learning_rate": 4.993889260567239e-06, + "loss": 0.1901, + "step": 113 + }, + { + "epoch": 1.0, + "grad_norm": 10.368465423583984, + "learning_rate": 4.9929134500571954e-06, + "loss": 0.1996, + "step": 114 + }, + { + "epoch": 1.0087719298245614, + "grad_norm": 30.44986343383789, + "learning_rate": 4.991865498871647e-06, + "loss": 0.2606, + "step": 115 + }, + { + "epoch": 1.0175438596491229, + "grad_norm": 14.421515464782715, + "learning_rate": 4.99074543733652e-06, + "loss": 0.2394, + "step": 116 + }, + { + "epoch": 1.0263157894736843, + "grad_norm": 14.072005271911621, + "learning_rate": 4.989553297864489e-06, + "loss": 0.2288, + "step": 117 + }, + { + "epoch": 1.0350877192982457, + "grad_norm": 4.395325660705566, + "learning_rate": 4.988289114954045e-06, + "loss": 0.2129, + "step": 118 + }, + { + "epoch": 1.043859649122807, + "grad_norm": 7.286703586578369, + "learning_rate": 4.986952925188489e-06, + "loss": 0.186, + "step": 119 + }, + { + "epoch": 1.0526315789473684, + "grad_norm": 8.332784652709961, + "learning_rate": 4.98554476723488e-06, + "loss": 0.178, + "step": 120 + }, + { + "epoch": 1.0614035087719298, + "grad_norm": 1.3646447658538818, + "learning_rate": 
4.984064681842917e-06, + "loss": 0.1687, + "step": 121 + }, + { + "epoch": 1.0701754385964912, + "grad_norm": 4.494940757751465, + "learning_rate": 4.982512711843753e-06, + "loss": 0.1881, + "step": 122 + }, + { + "epoch": 1.0789473684210527, + "grad_norm": 3.3929836750030518, + "learning_rate": 4.980888902148757e-06, + "loss": 0.1764, + "step": 123 + }, + { + "epoch": 1.087719298245614, + "grad_norm": 1.8281155824661255, + "learning_rate": 4.979193299748225e-06, + "loss": 0.1602, + "step": 124 + }, + { + "epoch": 1.0964912280701755, + "grad_norm": 3.494239568710327, + "learning_rate": 4.977425953710005e-06, + "loss": 0.1729, + "step": 125 + }, + { + "epoch": 1.1052631578947367, + "grad_norm": 1.500410556793213, + "learning_rate": 4.975586915178084e-06, + "loss": 0.1666, + "step": 126 + }, + { + "epoch": 1.1140350877192982, + "grad_norm": 1.4680222272872925, + "learning_rate": 4.973676237371111e-06, + "loss": 0.159, + "step": 127 + }, + { + "epoch": 1.1228070175438596, + "grad_norm": 3.0383460521698, + "learning_rate": 4.971693975580851e-06, + "loss": 0.1484, + "step": 128 + }, + { + "epoch": 1.131578947368421, + "grad_norm": 3.74821138381958, + "learning_rate": 4.969640187170591e-06, + "loss": 0.1586, + "step": 129 + }, + { + "epoch": 1.1403508771929824, + "grad_norm": 4.682602405548096, + "learning_rate": 4.967514931573473e-06, + "loss": 0.1619, + "step": 130 + }, + { + "epoch": 1.1491228070175439, + "grad_norm": 3.90673565864563, + "learning_rate": 4.965318270290779e-06, + "loss": 0.164, + "step": 131 + }, + { + "epoch": 1.1578947368421053, + "grad_norm": 2.2017388343811035, + "learning_rate": 4.963050266890152e-06, + "loss": 0.1499, + "step": 132 + }, + { + "epoch": 1.1666666666666667, + "grad_norm": 2.4211816787719727, + "learning_rate": 4.960710987003753e-06, + "loss": 0.1387, + "step": 133 + }, + { + "epoch": 1.1754385964912282, + "grad_norm": 1.7753759622573853, + "learning_rate": 4.958300498326363e-06, + "loss": 0.1441, + "step": 134 + }, + { + "epoch": 
1.1842105263157894, + "grad_norm": 1.5529910326004028, + "learning_rate": 4.955818870613425e-06, + "loss": 0.1304, + "step": 135 + }, + { + "epoch": 1.1929824561403508, + "grad_norm": 2.090593099594116, + "learning_rate": 4.953266175679023e-06, + "loss": 0.1419, + "step": 136 + }, + { + "epoch": 1.2017543859649122, + "grad_norm": 2.7141878604888916, + "learning_rate": 4.95064248739381e-06, + "loss": 0.1444, + "step": 137 + }, + { + "epoch": 1.2105263157894737, + "grad_norm": 2.3690481185913086, + "learning_rate": 4.947947881682861e-06, + "loss": 0.1383, + "step": 138 + }, + { + "epoch": 1.219298245614035, + "grad_norm": 2.2403147220611572, + "learning_rate": 4.945182436523482e-06, + "loss": 0.1418, + "step": 139 + }, + { + "epoch": 1.2280701754385965, + "grad_norm": 1.3939160108566284, + "learning_rate": 4.942346231942955e-06, + "loss": 0.1307, + "step": 140 + }, + { + "epoch": 1.236842105263158, + "grad_norm": 11.276732444763184, + "learning_rate": 4.939439350016214e-06, + "loss": 0.1397, + "step": 141 + }, + { + "epoch": 1.2456140350877192, + "grad_norm": 8.260516166687012, + "learning_rate": 4.9364618748634794e-06, + "loss": 0.1426, + "step": 142 + }, + { + "epoch": 1.2543859649122808, + "grad_norm": 2.09720516204834, + "learning_rate": 4.933413892647819e-06, + "loss": 0.1323, + "step": 143 + }, + { + "epoch": 1.263157894736842, + "grad_norm": 1.802125334739685, + "learning_rate": 4.9302954915726535e-06, + "loss": 0.1304, + "step": 144 + }, + { + "epoch": 1.2719298245614035, + "grad_norm": 1.7151471376419067, + "learning_rate": 4.927106761879207e-06, + "loss": 0.1264, + "step": 145 + }, + { + "epoch": 1.280701754385965, + "grad_norm": 1.6970336437225342, + "learning_rate": 4.923847795843894e-06, + "loss": 0.1227, + "step": 146 + }, + { + "epoch": 1.2894736842105263, + "grad_norm": 16.60441017150879, + "learning_rate": 4.920518687775647e-06, + "loss": 0.1606, + "step": 147 + }, + { + "epoch": 1.2982456140350878, + "grad_norm": 6.470354080200195, + 
"learning_rate": 4.917119534013194e-06, + "loss": 0.1447, + "step": 148 + }, + { + "epoch": 1.3070175438596492, + "grad_norm": 1.4908231496810913, + "learning_rate": 4.913650432922264e-06, + "loss": 0.1343, + "step": 149 + }, + { + "epoch": 1.3157894736842106, + "grad_norm": 3.19964861869812, + "learning_rate": 4.91011148489274e-06, + "loss": 0.1354, + "step": 150 + }, + { + "epoch": 1.3245614035087718, + "grad_norm": 2.6052839756011963, + "learning_rate": 4.906502792335761e-06, + "loss": 0.1342, + "step": 151 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 2.0719165802001953, + "learning_rate": 4.9028244596807525e-06, + "loss": 0.1359, + "step": 152 + }, + { + "epoch": 1.3421052631578947, + "grad_norm": 0.8086919784545898, + "learning_rate": 4.899076593372405e-06, + "loss": 0.1279, + "step": 153 + }, + { + "epoch": 1.3508771929824561, + "grad_norm": 1.0056848526000977, + "learning_rate": 4.8952593018675955e-06, + "loss": 0.1162, + "step": 154 + }, + { + "epoch": 1.3596491228070176, + "grad_norm": 5.72553014755249, + "learning_rate": 4.891372695632249e-06, + "loss": 0.1315, + "step": 155 + }, + { + "epoch": 1.368421052631579, + "grad_norm": 1.522894024848938, + "learning_rate": 4.887416887138139e-06, + "loss": 0.1266, + "step": 156 + }, + { + "epoch": 1.3771929824561404, + "grad_norm": 2.019472122192383, + "learning_rate": 4.883391990859635e-06, + "loss": 0.1262, + "step": 157 + }, + { + "epoch": 1.3859649122807016, + "grad_norm": 1.8594422340393066, + "learning_rate": 4.879298123270391e-06, + "loss": 0.125, + "step": 158 + }, + { + "epoch": 1.3947368421052633, + "grad_norm": 1.365377426147461, + "learning_rate": 4.8751354028399725e-06, + "loss": 0.1218, + "step": 159 + }, + { + "epoch": 1.4035087719298245, + "grad_norm": 3.553309917449951, + "learning_rate": 4.870903950030429e-06, + "loss": 0.1272, + "step": 160 + }, + { + "epoch": 1.412280701754386, + "grad_norm": 2.1770920753479004, + "learning_rate": 4.866603887292809e-06, + "loss": 0.1213, + "step": 161 
+ }, + { + "epoch": 1.4210526315789473, + "grad_norm": 1.6058955192565918, + "learning_rate": 4.862235339063613e-06, + "loss": 0.1173, + "step": 162 + }, + { + "epoch": 1.4298245614035088, + "grad_norm": 1.3208314180374146, + "learning_rate": 4.857798431761199e-06, + "loss": 0.1183, + "step": 163 + }, + { + "epoch": 1.4385964912280702, + "grad_norm": 1.282729983329773, + "learning_rate": 4.853293293782118e-06, + "loss": 0.1209, + "step": 164 + }, + { + "epoch": 1.4473684210526316, + "grad_norm": 1.3838152885437012, + "learning_rate": 4.848720055497401e-06, + "loss": 0.1198, + "step": 165 + }, + { + "epoch": 1.456140350877193, + "grad_norm": 1.2930737733840942, + "learning_rate": 4.844078849248785e-06, + "loss": 0.1268, + "step": 166 + }, + { + "epoch": 1.4649122807017543, + "grad_norm": 1.7022266387939453, + "learning_rate": 4.839369809344888e-06, + "loss": 0.1198, + "step": 167 + }, + { + "epoch": 1.4736842105263157, + "grad_norm": 1.0927815437316895, + "learning_rate": 4.834593072057313e-06, + "loss": 0.1132, + "step": 168 + }, + { + "epoch": 1.4824561403508771, + "grad_norm": 0.9326333999633789, + "learning_rate": 4.829748775616716e-06, + "loss": 0.1193, + "step": 169 + }, + { + "epoch": 1.4912280701754386, + "grad_norm": 1.3564742803573608, + "learning_rate": 4.8248370602087954e-06, + "loss": 0.118, + "step": 170 + }, + { + "epoch": 1.5, + "grad_norm": 1.19778573513031, + "learning_rate": 4.819858067970243e-06, + "loss": 0.1122, + "step": 171 + }, + { + "epoch": 1.5087719298245614, + "grad_norm": 2.8438351154327393, + "learning_rate": 4.814811942984625e-06, + "loss": 0.1217, + "step": 172 + }, + { + "epoch": 1.5175438596491229, + "grad_norm": 1.0701063871383667, + "learning_rate": 4.809698831278217e-06, + "loss": 0.1114, + "step": 173 + }, + { + "epoch": 1.526315789473684, + "grad_norm": 0.9053553938865662, + "learning_rate": 4.804518880815776e-06, + "loss": 0.1178, + "step": 174 + }, + { + "epoch": 1.5350877192982457, + "grad_norm": 0.42274603247642517, + 
"learning_rate": 4.799272241496259e-06, + "loss": 0.1091, + "step": 175 + }, + { + "epoch": 1.543859649122807, + "grad_norm": 0.8576470017433167, + "learning_rate": 4.793959065148484e-06, + "loss": 0.1134, + "step": 176 + }, + { + "epoch": 1.5526315789473686, + "grad_norm": 0.5910662412643433, + "learning_rate": 4.78857950552674e-06, + "loss": 0.1148, + "step": 177 + }, + { + "epoch": 1.5614035087719298, + "grad_norm": 0.8761632442474365, + "learning_rate": 4.783133718306331e-06, + "loss": 0.1125, + "step": 178 + }, + { + "epoch": 1.5701754385964912, + "grad_norm": 1.9190795421600342, + "learning_rate": 4.777621861079079e-06, + "loss": 0.1148, + "step": 179 + }, + { + "epoch": 1.5789473684210527, + "grad_norm": 0.6199957728385925, + "learning_rate": 4.772044093348757e-06, + "loss": 0.1097, + "step": 180 + }, + { + "epoch": 1.587719298245614, + "grad_norm": 1.562089443206787, + "learning_rate": 4.766400576526479e-06, + "loss": 0.1097, + "step": 181 + }, + { + "epoch": 1.5964912280701755, + "grad_norm": 1.4957091808319092, + "learning_rate": 4.760691473926021e-06, + "loss": 0.1216, + "step": 182 + }, + { + "epoch": 1.6052631578947367, + "grad_norm": 0.9863570332527161, + "learning_rate": 4.754916950759105e-06, + "loss": 0.1122, + "step": 183 + }, + { + "epoch": 1.6140350877192984, + "grad_norm": 0.5803346633911133, + "learning_rate": 4.749077174130609e-06, + "loss": 0.1103, + "step": 184 + }, + { + "epoch": 1.6228070175438596, + "grad_norm": 1.8789891004562378, + "learning_rate": 4.743172313033738e-06, + "loss": 0.1191, + "step": 185 + }, + { + "epoch": 1.631578947368421, + "grad_norm": 0.8731380105018616, + "learning_rate": 4.7372025383451285e-06, + "loss": 0.1154, + "step": 186 + }, + { + "epoch": 1.6403508771929824, + "grad_norm": 1.3535627126693726, + "learning_rate": 4.7311680228199075e-06, + "loss": 0.1123, + "step": 187 + }, + { + "epoch": 1.6491228070175439, + "grad_norm": 0.7211089134216309, + "learning_rate": 4.725068941086693e-06, + "loss": 0.1134, + 
"step": 188 + }, + { + "epoch": 1.6578947368421053, + "grad_norm": 1.4752328395843506, + "learning_rate": 4.718905469642534e-06, + "loss": 0.1185, + "step": 189 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.9822680354118347, + "learning_rate": 4.712677786847814e-06, + "loss": 0.1146, + "step": 190 + }, + { + "epoch": 1.6754385964912282, + "grad_norm": 1.1308330297470093, + "learning_rate": 4.706386072921083e-06, + "loss": 0.1061, + "step": 191 + }, + { + "epoch": 1.6842105263157894, + "grad_norm": 5.331939697265625, + "learning_rate": 4.70003050993384e-06, + "loss": 0.1153, + "step": 192 + }, + { + "epoch": 1.692982456140351, + "grad_norm": 0.6911673545837402, + "learning_rate": 4.6936112818052674e-06, + "loss": 0.1098, + "step": 193 + }, + { + "epoch": 1.7017543859649122, + "grad_norm": 0.5160980224609375, + "learning_rate": 4.687128574296912e-06, + "loss": 0.1073, + "step": 194 + }, + { + "epoch": 1.7105263157894737, + "grad_norm": 1.5724798440933228, + "learning_rate": 4.680582575007303e-06, + "loss": 0.121, + "step": 195 + }, + { + "epoch": 1.719298245614035, + "grad_norm": 1.3960011005401611, + "learning_rate": 4.6739734733665275e-06, + "loss": 0.1145, + "step": 196 + }, + { + "epoch": 1.7280701754385965, + "grad_norm": 1.4949183464050293, + "learning_rate": 4.6673014606307465e-06, + "loss": 0.1166, + "step": 197 + }, + { + "epoch": 1.736842105263158, + "grad_norm": 1.6873422861099243, + "learning_rate": 4.660566729876661e-06, + "loss": 0.1115, + "step": 198 + }, + { + "epoch": 1.7456140350877192, + "grad_norm": 1.3443641662597656, + "learning_rate": 4.653769475995926e-06, + "loss": 0.1119, + "step": 199 + }, + { + "epoch": 1.7543859649122808, + "grad_norm": 0.807525098323822, + "learning_rate": 4.646909895689508e-06, + "loss": 0.1059, + "step": 200 + }, + { + "epoch": 1.763157894736842, + "grad_norm": 1.589316964149475, + "learning_rate": 4.639988187461995e-06, + "loss": 0.1151, + "step": 201 + }, + { + "epoch": 1.7719298245614035, + "grad_norm": 
2.474756956100464, + "learning_rate": 4.633004551615851e-06, + "loss": 0.116, + "step": 202 + }, + { + "epoch": 1.780701754385965, + "grad_norm": 0.6210195422172546, + "learning_rate": 4.62595919024562e-06, + "loss": 0.1097, + "step": 203 + }, + { + "epoch": 1.7894736842105263, + "grad_norm": 0.7217905521392822, + "learning_rate": 4.618852307232078e-06, + "loss": 0.1117, + "step": 204 + }, + { + "epoch": 1.7982456140350878, + "grad_norm": 1.551251769065857, + "learning_rate": 4.611684108236334e-06, + "loss": 0.113, + "step": 205 + }, + { + "epoch": 1.807017543859649, + "grad_norm": 0.6619828939437866, + "learning_rate": 4.604454800693874e-06, + "loss": 0.113, + "step": 206 + }, + { + "epoch": 1.8157894736842106, + "grad_norm": 0.9461805820465088, + "learning_rate": 4.597164593808564e-06, + "loss": 0.1093, + "step": 207 + }, + { + "epoch": 1.8245614035087718, + "grad_norm": 1.2926547527313232, + "learning_rate": 4.589813698546592e-06, + "loss": 0.1128, + "step": 208 + }, + { + "epoch": 1.8333333333333335, + "grad_norm": 0.8754212856292725, + "learning_rate": 4.582402327630368e-06, + "loss": 0.1104, + "step": 209 + }, + { + "epoch": 1.8421052631578947, + "grad_norm": 0.846051812171936, + "learning_rate": 4.574930695532357e-06, + "loss": 0.1105, + "step": 210 + }, + { + "epoch": 1.8508771929824561, + "grad_norm": 1.3332515954971313, + "learning_rate": 4.567399018468889e-06, + "loss": 0.1101, + "step": 211 + }, + { + "epoch": 1.8596491228070176, + "grad_norm": 0.8729192614555359, + "learning_rate": 4.5598075143938855e-06, + "loss": 0.1081, + "step": 212 + }, + { + "epoch": 1.868421052631579, + "grad_norm": 0.8618345260620117, + "learning_rate": 4.552156402992567e-06, + "loss": 0.1059, + "step": 213 + }, + { + "epoch": 1.8771929824561404, + "grad_norm": 1.2135930061340332, + "learning_rate": 4.544445905675082e-06, + "loss": 0.1105, + "step": 214 + }, + { + "epoch": 1.8859649122807016, + "grad_norm": 0.8405666351318359, + "learning_rate": 4.536676245570111e-06, + "loss": 
0.1118, + "step": 215 + }, + { + "epoch": 1.8947368421052633, + "grad_norm": 0.42860639095306396, + "learning_rate": 4.528847647518403e-06, + "loss": 0.1093, + "step": 216 + }, + { + "epoch": 1.9035087719298245, + "grad_norm": 1.1538206338882446, + "learning_rate": 4.520960338066271e-06, + "loss": 0.1088, + "step": 217 + }, + { + "epoch": 1.912280701754386, + "grad_norm": 0.5870749354362488, + "learning_rate": 4.513014545459038e-06, + "loss": 0.1061, + "step": 218 + }, + { + "epoch": 1.9210526315789473, + "grad_norm": 0.7279748916625977, + "learning_rate": 4.505010499634427e-06, + "loss": 0.1032, + "step": 219 + }, + { + "epoch": 1.9298245614035088, + "grad_norm": 0.6331414580345154, + "learning_rate": 4.4969484322159125e-06, + "loss": 0.1109, + "step": 220 + }, + { + "epoch": 1.9385964912280702, + "grad_norm": 0.9024543166160583, + "learning_rate": 4.488828576506014e-06, + "loss": 0.1094, + "step": 221 + }, + { + "epoch": 1.9473684210526314, + "grad_norm": 3.540376901626587, + "learning_rate": 4.480651167479545e-06, + "loss": 0.1154, + "step": 222 + }, + { + "epoch": 1.956140350877193, + "grad_norm": 0.9506739377975464, + "learning_rate": 4.472416441776817e-06, + "loss": 0.108, + "step": 223 + }, + { + "epoch": 1.9649122807017543, + "grad_norm": 0.6585081815719604, + "learning_rate": 4.464124637696786e-06, + "loss": 0.1033, + "step": 224 + }, + { + "epoch": 1.973684210526316, + "grad_norm": 1.143038034439087, + "learning_rate": 4.455775995190161e-06, + "loss": 0.1092, + "step": 225 + }, + { + "epoch": 1.9824561403508771, + "grad_norm": 1.148261547088623, + "learning_rate": 4.4473707558524555e-06, + "loss": 0.1076, + "step": 226 + }, + { + "epoch": 1.9912280701754386, + "grad_norm": 0.7375811338424683, + "learning_rate": 4.438909162917003e-06, + "loss": 0.108, + "step": 227 + }, + { + "epoch": 2.0, + "grad_norm": 0.5254591703414917, + "learning_rate": 4.430391461247911e-06, + "loss": 0.1079, + "step": 228 + }, + { + "epoch": 2.008771929824561, + "grad_norm": 
1.0198495388031006, + "learning_rate": 4.42181789733298e-06, + "loss": 0.1083, + "step": 229 + }, + { + "epoch": 2.017543859649123, + "grad_norm": 0.9234157800674438, + "learning_rate": 4.413188719276569e-06, + "loss": 0.1084, + "step": 230 + }, + { + "epoch": 2.026315789473684, + "grad_norm": 0.5215068459510803, + "learning_rate": 4.404504176792414e-06, + "loss": 0.1067, + "step": 231 + }, + { + "epoch": 2.0350877192982457, + "grad_norm": 0.9296736121177673, + "learning_rate": 4.3957645211964065e-06, + "loss": 0.1066, + "step": 232 + }, + { + "epoch": 2.043859649122807, + "grad_norm": 0.8660671710968018, + "learning_rate": 4.386970005399314e-06, + "loss": 0.108, + "step": 233 + }, + { + "epoch": 2.0526315789473686, + "grad_norm": 0.6014883518218994, + "learning_rate": 4.378120883899467e-06, + "loss": 0.1068, + "step": 234 + }, + { + "epoch": 2.06140350877193, + "grad_norm": 0.6370371580123901, + "learning_rate": 4.369217412775393e-06, + "loss": 0.1076, + "step": 235 + }, + { + "epoch": 2.0701754385964914, + "grad_norm": 0.9806828498840332, + "learning_rate": 4.360259849678402e-06, + "loss": 0.1071, + "step": 236 + }, + { + "epoch": 2.0789473684210527, + "grad_norm": 0.6093440651893616, + "learning_rate": 4.351248453825137e-06, + "loss": 0.1038, + "step": 237 + }, + { + "epoch": 2.087719298245614, + "grad_norm": 1.3494842052459717, + "learning_rate": 4.3421834859900695e-06, + "loss": 0.1105, + "step": 238 + }, + { + "epoch": 2.0964912280701755, + "grad_norm": 0.7621576189994812, + "learning_rate": 4.333065208497949e-06, + "loss": 0.1048, + "step": 239 + }, + { + "epoch": 2.1052631578947367, + "grad_norm": 0.5918282866477966, + "learning_rate": 4.3238938852162195e-06, + "loss": 0.1086, + "step": 240 + }, + { + "epoch": 2.1140350877192984, + "grad_norm": 0.7048676609992981, + "learning_rate": 4.314669781547379e-06, + "loss": 0.1061, + "step": 241 + }, + { + "epoch": 2.1228070175438596, + "grad_norm": 1.0750821828842163, + "learning_rate": 4.305393164421301e-06, + 
"loss": 0.1082, + "step": 242 + }, + { + "epoch": 2.1315789473684212, + "grad_norm": 0.6171414852142334, + "learning_rate": 4.296064302287507e-06, + "loss": 0.1039, + "step": 243 + }, + { + "epoch": 2.1403508771929824, + "grad_norm": 0.8080905079841614, + "learning_rate": 4.286683465107403e-06, + "loss": 0.1069, + "step": 244 + }, + { + "epoch": 2.1491228070175437, + "grad_norm": 0.5281466245651245, + "learning_rate": 4.277250924346461e-06, + "loss": 0.1069, + "step": 245 + }, + { + "epoch": 2.1578947368421053, + "grad_norm": 0.8070254325866699, + "learning_rate": 4.267766952966369e-06, + "loss": 0.1061, + "step": 246 + }, + { + "epoch": 2.1666666666666665, + "grad_norm": 0.8560577630996704, + "learning_rate": 4.25823182541713e-06, + "loss": 0.1116, + "step": 247 + }, + { + "epoch": 2.175438596491228, + "grad_norm": 0.7772330045700073, + "learning_rate": 4.2486458176291176e-06, + "loss": 0.1092, + "step": 248 + }, + { + "epoch": 2.1842105263157894, + "grad_norm": 0.814601719379425, + "learning_rate": 4.239009207005096e-06, + "loss": 0.1093, + "step": 249 + }, + { + "epoch": 2.192982456140351, + "grad_norm": 0.957789957523346, + "learning_rate": 4.2293222724121855e-06, + "loss": 0.1075, + "step": 250 + }, + { + "epoch": 2.2017543859649122, + "grad_norm": 0.500062108039856, + "learning_rate": 4.219585294173799e-06, + "loss": 0.1048, + "step": 251 + }, + { + "epoch": 2.2105263157894735, + "grad_norm": 0.3866419792175293, + "learning_rate": 4.209798554061527e-06, + "loss": 0.1074, + "step": 252 + }, + { + "epoch": 2.219298245614035, + "grad_norm": 1.1853291988372803, + "learning_rate": 4.199962335286985e-06, + "loss": 0.1076, + "step": 253 + }, + { + "epoch": 2.2280701754385963, + "grad_norm": 0.36602887511253357, + "learning_rate": 4.1900769224936125e-06, + "loss": 0.108, + "step": 254 + }, + { + "epoch": 2.236842105263158, + "grad_norm": 0.2530711889266968, + "learning_rate": 4.180142601748447e-06, + "loss": 0.1041, + "step": 255 + }, + { + "epoch": 
2.245614035087719, + "grad_norm": 1.3067054748535156, + "learning_rate": 4.170159660533834e-06, + "loss": 0.1087, + "step": 256 + }, + { + "epoch": 2.254385964912281, + "grad_norm": 0.3442043960094452, + "learning_rate": 4.160128387739114e-06, + "loss": 0.1099, + "step": 257 + }, + { + "epoch": 2.263157894736842, + "grad_norm": 1.174796462059021, + "learning_rate": 4.150049073652262e-06, + "loss": 0.1063, + "step": 258 + }, + { + "epoch": 2.2719298245614037, + "grad_norm": 0.5719411969184875, + "learning_rate": 4.1399220099514845e-06, + "loss": 0.1043, + "step": 259 + }, + { + "epoch": 2.280701754385965, + "grad_norm": 0.7268956303596497, + "learning_rate": 4.129747489696781e-06, + "loss": 0.1038, + "step": 260 + }, + { + "epoch": 2.2894736842105265, + "grad_norm": 0.7028316259384155, + "learning_rate": 4.119525807321467e-06, + "loss": 0.1052, + "step": 261 + }, + { + "epoch": 2.2982456140350878, + "grad_norm": 1.015335202217102, + "learning_rate": 4.109257258623644e-06, + "loss": 0.1116, + "step": 262 + }, + { + "epoch": 2.307017543859649, + "grad_norm": 0.7141755819320679, + "learning_rate": 4.098942140757646e-06, + "loss": 0.108, + "step": 263 + }, + { + "epoch": 2.3157894736842106, + "grad_norm": 0.7656403183937073, + "learning_rate": 4.0885807522254435e-06, + "loss": 0.1043, + "step": 264 + }, + { + "epoch": 2.324561403508772, + "grad_norm": 0.43293774127960205, + "learning_rate": 4.078173392867998e-06, + "loss": 0.1048, + "step": 265 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 0.6755763292312622, + "learning_rate": 4.0677203638565895e-06, + "loss": 0.1064, + "step": 266 + }, + { + "epoch": 2.3421052631578947, + "grad_norm": 0.9648827314376831, + "learning_rate": 4.0572219676841e-06, + "loss": 0.1088, + "step": 267 + }, + { + "epoch": 2.3508771929824563, + "grad_norm": 0.32724836468696594, + "learning_rate": 4.046678508156259e-06, + "loss": 0.1077, + "step": 268 + }, + { + "epoch": 2.3596491228070176, + "grad_norm": 0.4696657061576843, + 
"learning_rate": 4.036090290382855e-06, + "loss": 0.1067, + "step": 269 + }, + { + "epoch": 2.3684210526315788, + "grad_norm": 0.33901306986808777, + "learning_rate": 4.025457620768901e-06, + "loss": 0.105, + "step": 270 + }, + { + "epoch": 2.3771929824561404, + "grad_norm": 0.5703794360160828, + "learning_rate": 4.014780807005775e-06, + "loss": 0.1033, + "step": 271 + }, + { + "epoch": 2.3859649122807016, + "grad_norm": 0.9639355540275574, + "learning_rate": 4.004060158062306e-06, + "loss": 0.1041, + "step": 272 + }, + { + "epoch": 2.3947368421052633, + "grad_norm": 0.8851558566093445, + "learning_rate": 3.993295984175845e-06, + "loss": 0.1064, + "step": 273 + }, + { + "epoch": 2.4035087719298245, + "grad_norm": 0.5200062990188599, + "learning_rate": 3.982488596843276e-06, + "loss": 0.1056, + "step": 274 + }, + { + "epoch": 2.412280701754386, + "grad_norm": 1.160823106765747, + "learning_rate": 3.971638308812007e-06, + "loss": 0.1069, + "step": 275 + }, + { + "epoch": 2.4210526315789473, + "grad_norm": 1.0191210508346558, + "learning_rate": 3.9607454340709215e-06, + "loss": 0.1042, + "step": 276 + }, + { + "epoch": 2.4298245614035086, + "grad_norm": 0.37181487679481506, + "learning_rate": 3.949810287841289e-06, + "loss": 0.1062, + "step": 277 + }, + { + "epoch": 2.43859649122807, + "grad_norm": 0.9328593611717224, + "learning_rate": 3.9388331865676436e-06, + "loss": 0.1086, + "step": 278 + }, + { + "epoch": 2.4473684210526314, + "grad_norm": 0.8024734258651733, + "learning_rate": 3.927814447908625e-06, + "loss": 0.1051, + "step": 279 + }, + { + "epoch": 2.456140350877193, + "grad_norm": 0.9746696352958679, + "learning_rate": 3.916754390727795e-06, + "loss": 0.1041, + "step": 280 + }, + { + "epoch": 2.4649122807017543, + "grad_norm": 0.5457844138145447, + "learning_rate": 3.905653335084394e-06, + "loss": 0.1052, + "step": 281 + }, + { + "epoch": 2.473684210526316, + "grad_norm": 1.0736924409866333, + "learning_rate": 3.8945116022240945e-06, + "loss": 0.1075, + 
"step": 282 + }, + { + "epoch": 2.482456140350877, + "grad_norm": 0.6335628032684326, + "learning_rate": 3.8833295145696964e-06, + "loss": 0.1036, + "step": 283 + }, + { + "epoch": 2.4912280701754383, + "grad_norm": 0.6909618377685547, + "learning_rate": 3.872107395711799e-06, + "loss": 0.1089, + "step": 284 + }, + { + "epoch": 2.5, + "grad_norm": 2.1871702671051025, + "learning_rate": 3.860845570399435e-06, + "loss": 0.1066, + "step": 285 + }, + { + "epoch": 2.5087719298245617, + "grad_norm": 0.5831722617149353, + "learning_rate": 3.849544364530678e-06, + "loss": 0.1055, + "step": 286 + }, + { + "epoch": 2.517543859649123, + "grad_norm": 0.5302637815475464, + "learning_rate": 3.838204105143204e-06, + "loss": 0.1057, + "step": 287 + }, + { + "epoch": 2.526315789473684, + "grad_norm": 0.6348035931587219, + "learning_rate": 3.8268251204048335e-06, + "loss": 0.1089, + "step": 288 + }, + { + "epoch": 2.5350877192982457, + "grad_norm": 2.1932008266448975, + "learning_rate": 3.815407739604033e-06, + "loss": 0.1043, + "step": 289 + }, + { + "epoch": 2.543859649122807, + "grad_norm": 0.4388940930366516, + "learning_rate": 3.803952293140385e-06, + "loss": 0.1055, + "step": 290 + }, + { + "epoch": 2.5526315789473686, + "grad_norm": 0.6853339076042175, + "learning_rate": 3.7924591125150265e-06, + "loss": 0.1036, + "step": 291 + }, + { + "epoch": 2.56140350877193, + "grad_norm": 0.34744876623153687, + "learning_rate": 3.78092853032106e-06, + "loss": 0.1025, + "step": 292 + }, + { + "epoch": 2.5701754385964914, + "grad_norm": 0.9523847699165344, + "learning_rate": 3.769360880233922e-06, + "loss": 0.1067, + "step": 293 + }, + { + "epoch": 2.5789473684210527, + "grad_norm": 1.303745985031128, + "learning_rate": 3.7577564970017338e-06, + "loss": 0.1082, + "step": 294 + }, + { + "epoch": 2.587719298245614, + "grad_norm": 0.9468981623649597, + "learning_rate": 3.7461157164356103e-06, + "loss": 0.1055, + "step": 295 + }, + { + "epoch": 2.5964912280701755, + "grad_norm": 
0.7204175591468811, + "learning_rate": 3.7344388753999434e-06, + "loss": 0.1055, + "step": 296 + }, + { + "epoch": 2.6052631578947367, + "grad_norm": 0.5110165476799011, + "learning_rate": 3.7227263118026537e-06, + "loss": 0.1092, + "step": 297 + }, + { + "epoch": 2.6140350877192984, + "grad_norm": 0.6483246088027954, + "learning_rate": 3.7109783645854116e-06, + "loss": 0.1078, + "step": 298 + }, + { + "epoch": 2.6228070175438596, + "grad_norm": 0.5058422684669495, + "learning_rate": 3.699195373713831e-06, + "loss": 0.1073, + "step": 299 + }, + { + "epoch": 2.6315789473684212, + "grad_norm": 0.4123518764972687, + "learning_rate": 3.6873776801676265e-06, + "loss": 0.1053, + "step": 300 + }, + { + "epoch": 2.6403508771929824, + "grad_norm": 1.0864709615707397, + "learning_rate": 3.675525625930751e-06, + "loss": 0.1048, + "step": 301 + }, + { + "epoch": 2.6491228070175437, + "grad_norm": 1.0264904499053955, + "learning_rate": 3.6636395539814975e-06, + "loss": 0.1059, + "step": 302 + }, + { + "epoch": 2.6578947368421053, + "grad_norm": 0.7724822163581848, + "learning_rate": 3.651719808282573e-06, + "loss": 0.1063, + "step": 303 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.7474755644798279, + "learning_rate": 3.6397667337711475e-06, + "loss": 0.1034, + "step": 304 + }, + { + "epoch": 2.675438596491228, + "grad_norm": 0.5628909468650818, + "learning_rate": 3.6277806763488666e-06, + "loss": 0.1026, + "step": 305 + }, + { + "epoch": 2.6842105263157894, + "grad_norm": 0.9070547819137573, + "learning_rate": 3.6157619828718477e-06, + "loss": 0.1031, + "step": 306 + }, + { + "epoch": 2.692982456140351, + "grad_norm": 0.6968091130256653, + "learning_rate": 3.603711001140641e-06, + "loss": 0.1068, + "step": 307 + }, + { + "epoch": 2.7017543859649122, + "grad_norm": 0.3764977753162384, + "learning_rate": 3.5916280798901604e-06, + "loss": 0.1038, + "step": 308 + }, + { + "epoch": 2.7105263157894735, + "grad_norm": 5.012625694274902, + "learning_rate": 
3.5795135687795984e-06, + "loss": 0.1129, + "step": 309 + }, + { + "epoch": 2.719298245614035, + "grad_norm": 0.6745572686195374, + "learning_rate": 3.567367818382303e-06, + "loss": 0.1071, + "step": 310 + }, + { + "epoch": 2.7280701754385968, + "grad_norm": 1.0659606456756592, + "learning_rate": 3.555191180175634e-06, + "loss": 0.1067, + "step": 311 + }, + { + "epoch": 2.736842105263158, + "grad_norm": 1.7312604188919067, + "learning_rate": 3.5429840065307924e-06, + "loss": 0.1101, + "step": 312 + }, + { + "epoch": 2.745614035087719, + "grad_norm": 1.100364327430725, + "learning_rate": 3.5307466507026223e-06, + "loss": 0.1098, + "step": 313 + }, + { + "epoch": 2.754385964912281, + "grad_norm": 1.0390428304672241, + "learning_rate": 3.5184794668193893e-06, + "loss": 0.1094, + "step": 314 + }, + { + "epoch": 2.763157894736842, + "grad_norm": 0.3369971811771393, + "learning_rate": 3.5061828098725327e-06, + "loss": 0.1053, + "step": 315 + }, + { + "epoch": 2.7719298245614032, + "grad_norm": 0.6130257248878479, + "learning_rate": 3.4938570357063906e-06, + "loss": 0.106, + "step": 316 + }, + { + "epoch": 2.780701754385965, + "grad_norm": 0.6387595534324646, + "learning_rate": 3.481502501007904e-06, + "loss": 0.1044, + "step": 317 + }, + { + "epoch": 2.7894736842105265, + "grad_norm": 1.0731587409973145, + "learning_rate": 3.469119563296296e-06, + "loss": 0.1097, + "step": 318 + }, + { + "epoch": 2.7982456140350878, + "grad_norm": 0.8096229434013367, + "learning_rate": 3.4567085809127247e-06, + "loss": 0.1076, + "step": 319 + }, + { + "epoch": 2.807017543859649, + "grad_norm": 0.5034844279289246, + "learning_rate": 3.444269913009912e-06, + "loss": 0.1071, + "step": 320 + }, + { + "epoch": 2.8157894736842106, + "grad_norm": 0.675139307975769, + "learning_rate": 3.4318039195417536e-06, + "loss": 0.1039, + "step": 321 + }, + { + "epoch": 2.824561403508772, + "grad_norm": 0.7330355644226074, + "learning_rate": 3.4193109612528972e-06, + "loss": 0.1044, + "step": 322 + }, + { 
+ "epoch": 2.8333333333333335, + "grad_norm": 0.6558271646499634, + "learning_rate": 3.4067913996683115e-06, + "loss": 0.1051, + "step": 323 + }, + { + "epoch": 2.8421052631578947, + "grad_norm": 0.8411844372749329, + "learning_rate": 3.3942455970828146e-06, + "loss": 0.1063, + "step": 324 + }, + { + "epoch": 2.8508771929824563, + "grad_norm": 0.4817325174808502, + "learning_rate": 3.3816739165505964e-06, + "loss": 0.105, + "step": 325 + }, + { + "epoch": 2.8596491228070176, + "grad_norm": 0.424554705619812, + "learning_rate": 3.3690767218747104e-06, + "loss": 0.1037, + "step": 326 + }, + { + "epoch": 2.8684210526315788, + "grad_norm": 1.0054417848587036, + "learning_rate": 3.3564543775965475e-06, + "loss": 0.1058, + "step": 327 + }, + { + "epoch": 2.8771929824561404, + "grad_norm": 0.8984584808349609, + "learning_rate": 3.3438072489852837e-06, + "loss": 0.1079, + "step": 328 + }, + { + "epoch": 2.8859649122807016, + "grad_norm": 0.6779558062553406, + "learning_rate": 3.331135702027311e-06, + "loss": 0.1046, + "step": 329 + }, + { + "epoch": 2.8947368421052633, + "grad_norm": 0.6931657195091248, + "learning_rate": 3.318440103415649e-06, + "loss": 0.1106, + "step": 330 + }, + { + "epoch": 2.9035087719298245, + "grad_norm": 0.705264151096344, + "learning_rate": 3.305720820539329e-06, + "loss": 0.104, + "step": 331 + }, + { + "epoch": 2.912280701754386, + "grad_norm": 0.7799407839775085, + "learning_rate": 3.2929782214727657e-06, + "loss": 0.1019, + "step": 332 + }, + { + "epoch": 2.9210526315789473, + "grad_norm": 0.7583760619163513, + "learning_rate": 3.2802126749651042e-06, + "loss": 0.1049, + "step": 333 + }, + { + "epoch": 2.9298245614035086, + "grad_norm": 0.6145837306976318, + "learning_rate": 3.2674245504295505e-06, + "loss": 0.104, + "step": 334 + }, + { + "epoch": 2.93859649122807, + "grad_norm": 0.5170779228210449, + "learning_rate": 3.254614217932679e-06, + "loss": 0.1024, + "step": 335 + }, + { + "epoch": 2.9473684210526314, + "grad_norm": 
0.6850940585136414, + "learning_rate": 3.241782048183726e-06, + "loss": 0.1047, + "step": 336 + }, + { + "epoch": 2.956140350877193, + "grad_norm": 0.7307694554328918, + "learning_rate": 3.2289284125238597e-06, + "loss": 0.1032, + "step": 337 + }, + { + "epoch": 2.9649122807017543, + "grad_norm": 0.3386179208755493, + "learning_rate": 3.216053682915436e-06, + "loss": 0.1037, + "step": 338 + }, + { + "epoch": 2.973684210526316, + "grad_norm": 0.7565059065818787, + "learning_rate": 3.203158231931234e-06, + "loss": 0.1048, + "step": 339 + }, + { + "epoch": 2.982456140350877, + "grad_norm": 0.7902039289474487, + "learning_rate": 3.190242432743673e-06, + "loss": 0.1068, + "step": 340 + }, + { + "epoch": 2.9912280701754383, + "grad_norm": 0.42595192790031433, + "learning_rate": 3.177306659114015e-06, + "loss": 0.1039, + "step": 341 + }, + { + "epoch": 3.0, + "grad_norm": 1.1214542388916016, + "learning_rate": 3.164351285381549e-06, + "loss": 0.1062, + "step": 342 + }, + { + "epoch": 3.008771929824561, + "grad_norm": 0.7622955441474915, + "learning_rate": 3.1513766864527577e-06, + "loss": 0.1015, + "step": 343 + }, + { + "epoch": 3.017543859649123, + "grad_norm": 0.2676297724246979, + "learning_rate": 3.1383832377904676e-06, + "loss": 0.1037, + "step": 344 + }, + { + "epoch": 3.026315789473684, + "grad_norm": 0.8695605397224426, + "learning_rate": 3.1253713154029857e-06, + "loss": 0.1056, + "step": 345 + }, + { + "epoch": 3.0350877192982457, + "grad_norm": 0.5875906944274902, + "learning_rate": 3.1123412958332155e-06, + "loss": 0.1067, + "step": 346 + }, + { + "epoch": 3.043859649122807, + "grad_norm": 0.7699372172355652, + "learning_rate": 3.0992935561477632e-06, + "loss": 0.1035, + "step": 347 + }, + { + "epoch": 3.0526315789473686, + "grad_norm": 0.5919204354286194, + "learning_rate": 3.0862284739260247e-06, + "loss": 0.1023, + "step": 348 + }, + { + "epoch": 3.06140350877193, + "grad_norm": 1.3211849927902222, + "learning_rate": 3.07314642724926e-06, + "loss": 0.1065, 
+ "step": 349 + }, + { + "epoch": 3.0701754385964914, + "grad_norm": 0.6359637379646301, + "learning_rate": 3.0600477946896494e-06, + "loss": 0.106, + "step": 350 + }, + { + "epoch": 3.0789473684210527, + "grad_norm": 0.35776662826538086, + "learning_rate": 3.046932955299344e-06, + "loss": 0.1046, + "step": 351 + }, + { + "epoch": 3.087719298245614, + "grad_norm": 0.6657406687736511, + "learning_rate": 3.0338022885994904e-06, + "loss": 0.1076, + "step": 352 + }, + { + "epoch": 3.0964912280701755, + "grad_norm": 0.7587785720825195, + "learning_rate": 3.0206561745692512e-06, + "loss": 0.1043, + "step": 353 + }, + { + "epoch": 3.1052631578947367, + "grad_norm": 1.1258317232131958, + "learning_rate": 3.0074949936348084e-06, + "loss": 0.1043, + "step": 354 + }, + { + "epoch": 3.1140350877192984, + "grad_norm": 0.3570568263530731, + "learning_rate": 2.9943191266583564e-06, + "loss": 0.1032, + "step": 355 + }, + { + "epoch": 3.1228070175438596, + "grad_norm": 0.843485414981842, + "learning_rate": 2.981128954927075e-06, + "loss": 0.1045, + "step": 356 + }, + { + "epoch": 3.1315789473684212, + "grad_norm": 0.5719651579856873, + "learning_rate": 2.967924860142103e-06, + "loss": 0.1052, + "step": 357 + }, + { + "epoch": 3.1403508771929824, + "grad_norm": 2.20767879486084, + "learning_rate": 2.9547072244074853e-06, + "loss": 0.1078, + "step": 358 + }, + { + "epoch": 3.1491228070175437, + "grad_norm": 0.3715457022190094, + "learning_rate": 2.941476430219122e-06, + "loss": 0.1047, + "step": 359 + }, + { + "epoch": 3.1578947368421053, + "grad_norm": 0.7803200483322144, + "learning_rate": 2.928232860453694e-06, + "loss": 0.1029, + "step": 360 + }, + { + "epoch": 3.1666666666666665, + "grad_norm": 0.5198164582252502, + "learning_rate": 2.9149768983575884e-06, + "loss": 0.1032, + "step": 361 + }, + { + "epoch": 3.175438596491228, + "grad_norm": 0.7827185988426208, + "learning_rate": 2.9017089275358017e-06, + "loss": 0.1043, + "step": 362 + }, + { + "epoch": 3.1842105263157894, + 
"grad_norm": 0.4000351130962372, + "learning_rate": 2.8884293319408464e-06, + "loss": 0.1071, + "step": 363 + }, + { + "epoch": 3.192982456140351, + "grad_norm": 0.9913386106491089, + "learning_rate": 2.8751384958616318e-06, + "loss": 0.1022, + "step": 364 + }, + { + "epoch": 3.2017543859649122, + "grad_norm": 0.6975695490837097, + "learning_rate": 2.861836803912353e-06, + "loss": 0.1029, + "step": 365 + }, + { + "epoch": 3.2105263157894735, + "grad_norm": 0.2372695654630661, + "learning_rate": 2.8485246410213497e-06, + "loss": 0.1015, + "step": 366 + }, + { + "epoch": 3.219298245614035, + "grad_norm": 0.447732537984848, + "learning_rate": 2.835202392419977e-06, + "loss": 0.1052, + "step": 367 + }, + { + "epoch": 3.2280701754385963, + "grad_norm": 0.6617346405982971, + "learning_rate": 2.8218704436314525e-06, + "loss": 0.1055, + "step": 368 + }, + { + "epoch": 3.236842105263158, + "grad_norm": 0.5550402402877808, + "learning_rate": 2.8085291804596995e-06, + "loss": 0.102, + "step": 369 + }, + { + "epoch": 3.245614035087719, + "grad_norm": 0.6046020984649658, + "learning_rate": 2.795178988978185e-06, + "loss": 0.1036, + "step": 370 + }, + { + "epoch": 3.254385964912281, + "grad_norm": 0.41890618205070496, + "learning_rate": 2.781820255518745e-06, + "loss": 0.1036, + "step": 371 + }, + { + "epoch": 3.263157894736842, + "grad_norm": 0.8387415409088135, + "learning_rate": 2.768453366660408e-06, + "loss": 0.1076, + "step": 372 + }, + { + "epoch": 3.2719298245614037, + "grad_norm": 0.5318773984909058, + "learning_rate": 2.755078709218203e-06, + "loss": 0.1052, + "step": 373 + }, + { + "epoch": 3.280701754385965, + "grad_norm": 0.6617523431777954, + "learning_rate": 2.741696670231969e-06, + "loss": 0.1049, + "step": 374 + }, + { + "epoch": 3.2894736842105265, + "grad_norm": 1.0190025568008423, + "learning_rate": 2.728307636955156e-06, + "loss": 0.1034, + "step": 375 + }, + { + "epoch": 3.2982456140350878, + "grad_norm": 0.6924716234207153, + "learning_rate": 
2.714911996843617e-06, + "loss": 0.1065, + "step": 376 + }, + { + "epoch": 3.307017543859649, + "grad_norm": 0.42501118779182434, + "learning_rate": 2.701510137544393e-06, + "loss": 0.1019, + "step": 377 + }, + { + "epoch": 3.3157894736842106, + "grad_norm": 0.844886064529419, + "learning_rate": 2.6881024468845e-06, + "loss": 0.1047, + "step": 378 + }, + { + "epoch": 3.324561403508772, + "grad_norm": 0.46512728929519653, + "learning_rate": 2.674689312859704e-06, + "loss": 0.1043, + "step": 379 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 0.6242017149925232, + "learning_rate": 2.6612711236232915e-06, + "loss": 0.1046, + "step": 380 + }, + { + "epoch": 3.3421052631578947, + "grad_norm": 0.6578526496887207, + "learning_rate": 2.6478482674748375e-06, + "loss": 0.1031, + "step": 381 + }, + { + "epoch": 3.3508771929824563, + "grad_norm": 0.4822542667388916, + "learning_rate": 2.63442113284897e-06, + "loss": 0.1053, + "step": 382 + }, + { + "epoch": 3.3596491228070176, + "grad_norm": 0.48255595564842224, + "learning_rate": 2.6209901083041307e-06, + "loss": 0.1058, + "step": 383 + }, + { + "epoch": 3.3684210526315788, + "grad_norm": 0.6624025702476501, + "learning_rate": 2.6075555825113265e-06, + "loss": 0.1066, + "step": 384 + }, + { + "epoch": 3.3771929824561404, + "grad_norm": 0.6962618827819824, + "learning_rate": 2.5941179442428864e-06, + "loss": 0.102, + "step": 385 + }, + { + "epoch": 3.3859649122807016, + "grad_norm": 0.4976450502872467, + "learning_rate": 2.580677582361208e-06, + "loss": 0.1011, + "step": 386 + }, + { + "epoch": 3.3947368421052633, + "grad_norm": 0.5283737182617188, + "learning_rate": 2.5672348858075053e-06, + "loss": 0.1057, + "step": 387 + }, + { + "epoch": 3.4035087719298245, + "grad_norm": 0.32338738441467285, + "learning_rate": 2.553790243590556e-06, + "loss": 0.1015, + "step": 388 + }, + { + "epoch": 3.412280701754386, + "grad_norm": 0.7909435629844666, + "learning_rate": 2.5403440447754385e-06, + "loss": 0.1036, + "step": 389 + }, 
+ { + "epoch": 3.4210526315789473, + "grad_norm": 0.6297115087509155, + "learning_rate": 2.5268966784722792e-06, + "loss": 0.1042, + "step": 390 + }, + { + "epoch": 3.4298245614035086, + "grad_norm": 0.32988762855529785, + "learning_rate": 2.513448533824988e-06, + "loss": 0.1059, + "step": 391 + }, + { + "epoch": 3.43859649122807, + "grad_norm": 0.9211220145225525, + "learning_rate": 2.5e-06, + "loss": 0.1015, + "step": 392 + }, + { + "epoch": 3.4473684210526314, + "grad_norm": 1.2157588005065918, + "learning_rate": 2.486551466175013e-06, + "loss": 0.1035, + "step": 393 + }, + { + "epoch": 3.456140350877193, + "grad_norm": 0.4786648452281952, + "learning_rate": 2.4731033215277216e-06, + "loss": 0.1026, + "step": 394 + }, + { + "epoch": 3.4649122807017543, + "grad_norm": 0.37398242950439453, + "learning_rate": 2.4596559552245623e-06, + "loss": 0.1044, + "step": 395 + }, + { + "epoch": 3.473684210526316, + "grad_norm": 0.5536217093467712, + "learning_rate": 2.446209756409445e-06, + "loss": 0.1043, + "step": 396 + }, + { + "epoch": 3.482456140350877, + "grad_norm": 0.708406925201416, + "learning_rate": 2.432765114192495e-06, + "loss": 0.1046, + "step": 397 + }, + { + "epoch": 3.4912280701754383, + "grad_norm": 0.7140893340110779, + "learning_rate": 2.4193224176387926e-06, + "loss": 0.1039, + "step": 398 + }, + { + "epoch": 3.5, + "grad_norm": 0.8078088760375977, + "learning_rate": 2.4058820557571144e-06, + "loss": 0.1013, + "step": 399 + }, + { + "epoch": 3.5087719298245617, + "grad_norm": 0.7129591107368469, + "learning_rate": 2.3924444174886735e-06, + "loss": 0.1057, + "step": 400 + }, + { + "epoch": 3.517543859649123, + "grad_norm": 1.293412446975708, + "learning_rate": 2.37900989169587e-06, + "loss": 0.1081, + "step": 401 + }, + { + "epoch": 3.526315789473684, + "grad_norm": 0.7235314249992371, + "learning_rate": 2.3655788671510314e-06, + "loss": 0.1054, + "step": 402 + }, + { + "epoch": 3.5350877192982457, + "grad_norm": 0.6008841395378113, + "learning_rate": 
2.3521517325251637e-06, + "loss": 0.1033, + "step": 403 + }, + { + "epoch": 3.543859649122807, + "grad_norm": 0.6819609999656677, + "learning_rate": 2.3387288763767097e-06, + "loss": 0.1019, + "step": 404 + }, + { + "epoch": 3.5526315789473686, + "grad_norm": 0.5696406960487366, + "learning_rate": 2.325310687140296e-06, + "loss": 0.1043, + "step": 405 + }, + { + "epoch": 3.56140350877193, + "grad_norm": 0.8597077131271362, + "learning_rate": 2.3118975531155003e-06, + "loss": 0.1037, + "step": 406 + }, + { + "epoch": 3.5701754385964914, + "grad_norm": 0.43985217809677124, + "learning_rate": 2.2984898624556075e-06, + "loss": 0.105, + "step": 407 + }, + { + "epoch": 3.5789473684210527, + "grad_norm": 0.5448469519615173, + "learning_rate": 2.2850880031563845e-06, + "loss": 0.1037, + "step": 408 + }, + { + "epoch": 3.587719298245614, + "grad_norm": 0.8221977949142456, + "learning_rate": 2.271692363044845e-06, + "loss": 0.1015, + "step": 409 + }, + { + "epoch": 3.5964912280701755, + "grad_norm": 0.9838594198226929, + "learning_rate": 2.2583033297680316e-06, + "loss": 0.1085, + "step": 410 + }, + { + "epoch": 3.6052631578947367, + "grad_norm": 1.034848928451538, + "learning_rate": 2.2449212907817985e-06, + "loss": 0.104, + "step": 411 + }, + { + "epoch": 3.6140350877192984, + "grad_norm": 1.0788371562957764, + "learning_rate": 2.2315466333395927e-06, + "loss": 0.1033, + "step": 412 + }, + { + "epoch": 3.6228070175438596, + "grad_norm": 0.49096915125846863, + "learning_rate": 2.2181797444812557e-06, + "loss": 0.1044, + "step": 413 + }, + { + "epoch": 3.6315789473684212, + "grad_norm": 1.309685230255127, + "learning_rate": 2.204821011021815e-06, + "loss": 0.1036, + "step": 414 + }, + { + "epoch": 3.6403508771929824, + "grad_norm": 0.5014146566390991, + "learning_rate": 2.191470819540301e-06, + "loss": 0.104, + "step": 415 + }, + { + "epoch": 3.6491228070175437, + "grad_norm": 0.770470380783081, + "learning_rate": 2.178129556368548e-06, + "loss": 0.1049, + "step": 416 + }, + 
{ + "epoch": 3.6578947368421053, + "grad_norm": 0.4639376699924469, + "learning_rate": 2.1647976075800235e-06, + "loss": 0.1047, + "step": 417 + }, + { + "epoch": 3.6666666666666665, + "grad_norm": 1.101885437965393, + "learning_rate": 2.151475358978652e-06, + "loss": 0.1035, + "step": 418 + }, + { + "epoch": 3.675438596491228, + "grad_norm": 0.5644329786300659, + "learning_rate": 2.138163196087648e-06, + "loss": 0.103, + "step": 419 + }, + { + "epoch": 3.6842105263157894, + "grad_norm": 1.1015008687973022, + "learning_rate": 2.1248615041383686e-06, + "loss": 0.1054, + "step": 420 + }, + { + "epoch": 3.692982456140351, + "grad_norm": 0.7311366200447083, + "learning_rate": 2.111570668059155e-06, + "loss": 0.1043, + "step": 421 + }, + { + "epoch": 3.7017543859649122, + "grad_norm": 0.38242173194885254, + "learning_rate": 2.098291072464199e-06, + "loss": 0.1041, + "step": 422 + }, + { + "epoch": 3.7105263157894735, + "grad_norm": 1.231512188911438, + "learning_rate": 2.085023101642412e-06, + "loss": 0.1021, + "step": 423 + }, + { + "epoch": 3.719298245614035, + "grad_norm": 0.41761213541030884, + "learning_rate": 2.0717671395463063e-06, + "loss": 0.1062, + "step": 424 + }, + { + "epoch": 3.7280701754385968, + "grad_norm": 0.4593309462070465, + "learning_rate": 2.0585235697808794e-06, + "loss": 0.1012, + "step": 425 + }, + { + "epoch": 3.736842105263158, + "grad_norm": 0.9147135019302368, + "learning_rate": 2.0452927755925155e-06, + "loss": 0.1046, + "step": 426 + }, + { + "epoch": 3.745614035087719, + "grad_norm": 0.39639535546302795, + "learning_rate": 2.0320751398578984e-06, + "loss": 0.1018, + "step": 427 + }, + { + "epoch": 3.754385964912281, + "grad_norm": 0.688010573387146, + "learning_rate": 2.0188710450729255e-06, + "loss": 0.104, + "step": 428 + }, + { + "epoch": 3.763157894736842, + "grad_norm": 0.5140353441238403, + "learning_rate": 2.005680873341644e-06, + "loss": 0.1033, + "step": 429 + }, + { + "epoch": 3.7719298245614032, + "grad_norm": 
0.5970481634140015, + "learning_rate": 1.992505006365191e-06, + "loss": 0.1044, + "step": 430 + }, + { + "epoch": 3.780701754385965, + "grad_norm": 0.551162838935852, + "learning_rate": 1.9793438254307496e-06, + "loss": 0.1042, + "step": 431 + }, + { + "epoch": 3.7894736842105265, + "grad_norm": 0.5344637632369995, + "learning_rate": 1.96619771140051e-06, + "loss": 0.1042, + "step": 432 + }, + { + "epoch": 3.7982456140350878, + "grad_norm": 0.5357667207717896, + "learning_rate": 1.9530670447006566e-06, + "loss": 0.101, + "step": 433 + }, + { + "epoch": 3.807017543859649, + "grad_norm": 1.2536660432815552, + "learning_rate": 1.9399522053103514e-06, + "loss": 0.1008, + "step": 434 + }, + { + "epoch": 3.8157894736842106, + "grad_norm": 0.4888289272785187, + "learning_rate": 1.926853572750741e-06, + "loss": 0.1028, + "step": 435 + }, + { + "epoch": 3.824561403508772, + "grad_norm": 0.5810404419898987, + "learning_rate": 1.913771526073976e-06, + "loss": 0.1031, + "step": 436 + }, + { + "epoch": 3.8333333333333335, + "grad_norm": 0.5372979044914246, + "learning_rate": 1.9007064438522374e-06, + "loss": 0.107, + "step": 437 + }, + { + "epoch": 3.8421052631578947, + "grad_norm": 0.8293616771697998, + "learning_rate": 1.8876587041667855e-06, + "loss": 0.1033, + "step": 438 + }, + { + "epoch": 3.8508771929824563, + "grad_norm": 2.361504554748535, + "learning_rate": 1.8746286845970145e-06, + "loss": 0.1098, + "step": 439 + }, + { + "epoch": 3.8596491228070176, + "grad_norm": 0.70230633020401, + "learning_rate": 1.8616167622095328e-06, + "loss": 0.1034, + "step": 440 + }, + { + "epoch": 3.8684210526315788, + "grad_norm": 0.6323564052581787, + "learning_rate": 1.8486233135472436e-06, + "loss": 0.1058, + "step": 441 + }, + { + "epoch": 3.8771929824561404, + "grad_norm": 0.48205408453941345, + "learning_rate": 1.8356487146184517e-06, + "loss": 0.105, + "step": 442 + }, + { + "epoch": 3.8859649122807016, + "grad_norm": 0.6996872425079346, + "learning_rate": 1.8226933408859864e-06, 
+ "loss": 0.1083, + "step": 443 + }, + { + "epoch": 3.8947368421052633, + "grad_norm": 0.4114651679992676, + "learning_rate": 1.8097575672563278e-06, + "loss": 0.1003, + "step": 444 + }, + { + "epoch": 3.9035087719298245, + "grad_norm": 0.5234648585319519, + "learning_rate": 1.7968417680687666e-06, + "loss": 0.1019, + "step": 445 + }, + { + "epoch": 3.912280701754386, + "grad_norm": 1.0571491718292236, + "learning_rate": 1.7839463170845641e-06, + "loss": 0.1003, + "step": 446 + }, + { + "epoch": 3.9210526315789473, + "grad_norm": 0.7470094561576843, + "learning_rate": 1.7710715874761408e-06, + "loss": 0.1061, + "step": 447 + }, + { + "epoch": 3.9298245614035086, + "grad_norm": 0.901695191860199, + "learning_rate": 1.7582179518162742e-06, + "loss": 0.1015, + "step": 448 + }, + { + "epoch": 3.93859649122807, + "grad_norm": 1.0251179933547974, + "learning_rate": 1.7453857820673215e-06, + "loss": 0.1, + "step": 449 + }, + { + "epoch": 3.9473684210526314, + "grad_norm": 0.5065406560897827, + "learning_rate": 1.7325754495704508e-06, + "loss": 0.1036, + "step": 450 + }, + { + "epoch": 3.956140350877193, + "grad_norm": 0.9541155099868774, + "learning_rate": 1.7197873250348962e-06, + "loss": 0.1015, + "step": 451 + }, + { + "epoch": 3.9649122807017543, + "grad_norm": 0.6264199018478394, + "learning_rate": 1.7070217785272354e-06, + "loss": 0.1026, + "step": 452 + }, + { + "epoch": 3.973684210526316, + "grad_norm": 0.6260526180267334, + "learning_rate": 1.6942791794606716e-06, + "loss": 0.1039, + "step": 453 + }, + { + "epoch": 3.982456140350877, + "grad_norm": 0.4730931222438812, + "learning_rate": 1.681559896584352e-06, + "loss": 0.1045, + "step": 454 + }, + { + "epoch": 3.9912280701754383, + "grad_norm": 0.5011451840400696, + "learning_rate": 1.668864297972689e-06, + "loss": 0.1062, + "step": 455 + }, + { + "epoch": 4.0, + "grad_norm": 1.0113046169281006, + "learning_rate": 1.6561927510147172e-06, + "loss": 0.1005, + "step": 456 + } + ], + "logging_steps": 1, + 
"max_steps": 684, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 114, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.383996029659408e+19, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-456/training_args.bin b/checkpoint-456/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..38c27bdabb0e0e68242bce9d9302628a34f6e7cf --- /dev/null +++ b/checkpoint-456/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7cb0553c2c3dd5a010aed55eae3afd8bd7f096b43ba03d25af54dc26191426ae +size 7992 diff --git a/checkpoint-456/zero_to_fp32.py b/checkpoint-456/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/checkpoint-456/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in 
state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to 
the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel 
{unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + 
print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + 
partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the 
partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + 
state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. 
you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. 
Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/checkpoint-570/README.md b/checkpoint-570/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f4a3934800eeb082a0cb833d7b6af4f68eed3615 --- /dev/null +++ b/checkpoint-570/README.md @@ -0,0 +1,202 @@ +--- +base_model: nvidia/Llama-3_3-Nemotron-Super-49B-v1 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.0 \ No newline at end of file diff --git a/checkpoint-570/adapter_config.json b/checkpoint-570/adapter_config.json new file mode 100644 index 
0000000000000000000000000000000000000000..04e5237df60f7183856cc551f942e0ea492ed0be --- /dev/null +++ b/checkpoint-570/adapter_config.json @@ -0,0 +1,42 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "nvidia/Llama-3_3-Nemotron-Super-49B-v1", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 512, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [ + "embed_tokens", + "lm_head" + ], + "peft_type": "LORA", + "r": 256, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "k_proj", + "q_proj", + "v_proj", + "down_proj", + "gate_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-570/adapter_model.safetensors b/checkpoint-570/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..03e5d913f0cabdef46f28c2746e18c64694aa920 --- /dev/null +++ b/checkpoint-570/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4bfd3d5f12af03a754a0ee43e020ca5f08d1d2241ff456cfef469e34cf6f2aa +size 9016826528 diff --git a/checkpoint-570/global_step570/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-570/global_step570/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..10f08c4495d0763d3292461a287ae16760fbfa34 --- /dev/null +++ b/checkpoint-570/global_step570/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:10bc23db9233737e64aa4c5a8bb1fe4760aa94691ce7df815838960f65abe9d2 +size 27050164444 diff --git a/checkpoint-570/global_step570/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-570/global_step570/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f3d4f593c55f517f9992f81fdf4bb920c6280e62 --- /dev/null +++ b/checkpoint-570/global_step570/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:170708ac3a01d1c0140c7d33c17f2710b074d310e1b3c5468e4c011a7ea23e30 +size 27050169884 diff --git a/checkpoint-570/global_step570/mp_rank_00_model_states.pt b/checkpoint-570/global_step570/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4af4c9a635017c52f7532cb7e03e38f4260c14b5 --- /dev/null +++ b/checkpoint-570/global_step570/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38a54520a329f55ae70fee2fc9270649647c866ec0d5a2e9bed66c2950369245 +size 9776788601 diff --git a/checkpoint-570/latest b/checkpoint-570/latest new file mode 100644 index 0000000000000000000000000000000000000000..0433d1c81a4b69bdd8533de1f0573850078819c8 --- /dev/null +++ b/checkpoint-570/latest @@ -0,0 +1 @@ +global_step570 \ No newline at end of file diff --git a/checkpoint-570/rng_state_0.pth b/checkpoint-570/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..cadea5c4497e157de18771025fb48dd7a47bdfb2 --- /dev/null +++ b/checkpoint-570/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ef06f6fc50741e0a072d30f8d6ef66788bbe7cb3d11d5f3592a9eec58dcbdd1 +size 14512 diff --git a/checkpoint-570/rng_state_1.pth b/checkpoint-570/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..521c7cd5a942c2b3d731a0df2302940e8e1baf65 --- /dev/null +++ b/checkpoint-570/rng_state_1.pth @@ -0,0 +1,3 
@@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5b9f4f89dbf7eb3015045d850b8e4485292b7d21154769139ee2c636add2ea3 +size 14512 diff --git a/checkpoint-570/scheduler.pt b/checkpoint-570/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..0357e35344f89f2f24bb0d414d635df04fbbd556 --- /dev/null +++ b/checkpoint-570/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:865e282c4f805f5c50f6c4d4aa455e69a7386950da590f7fd7b70db9aef5414c +size 1064 diff --git a/checkpoint-570/special_tokens_map.json b/checkpoint-570/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..278b7f0f84be865c4687700ee7b3c63d89a51e18 --- /dev/null +++ b/checkpoint-570/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-570/tokenizer.json b/checkpoint-570/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-570/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-570/tokenizer_config.json b/checkpoint-570/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..edd01b980c1db496ea102a51c972ee8f5d1a2c74 --- /dev/null +++ b/checkpoint-570/tokenizer_config.json @@ -0,0 +1,2064 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + 
"content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": 
"<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": 
false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": 
false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": 
"<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": 
"<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}{%- if messages[0]['role'] == 'system' %}{%- set system_message = messages[0]['content']|trim %}{%- set messages = messages[1:] %}{%- else %}{%- set system_message = \"\" %}{%- endif %}{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}{{- system_message }}{{- \"<|eot_id|>\" }}{%- for message in messages %}{%- if message['role'] == 'assistant' and '' in message['content'] %}{%- set content = message['content'].split('')[-1].lstrip() %}{%- else %}{%- set content = message['content'] %}{%- endif %}{{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n' + content | trim + '<|eot_id|>' }}{%- endfor %}{%- if add_generation_prompt %}{{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}{%- endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + 
"model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|end_of_text|>", + "tokenizer_class": "PreTrainedTokenizer" +} diff --git a/checkpoint-570/trainer_state.json b/checkpoint-570/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e2a219421f43a17316af934fac0e081e4a99d61e --- /dev/null +++ b/checkpoint-570/trainer_state.json @@ -0,0 +1,4023 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 5.0, + "eval_steps": 500, + "global_step": 570, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008771929824561403, + "grad_norm": 39.56407165527344, + "learning_rate": 5.0000000000000004e-08, + "loss": 5.1375, + "step": 1 + }, + { + "epoch": 0.017543859649122806, + "grad_norm": 40.30452346801758, + "learning_rate": 1.0000000000000001e-07, + "loss": 5.1185, + "step": 2 + }, + { + "epoch": 0.02631578947368421, + "grad_norm": 40.062313079833984, + "learning_rate": 1.5000000000000002e-07, + "loss": 5.0762, + "step": 3 + }, + { + "epoch": 0.03508771929824561, + "grad_norm": 39.17148208618164, + "learning_rate": 2.0000000000000002e-07, + "loss": 5.016, + "step": 4 + }, + { + "epoch": 0.043859649122807015, + "grad_norm": 40.67367172241211, + "learning_rate": 2.5000000000000004e-07, + "loss": 5.0428, + "step": 5 + }, + { + "epoch": 0.05263157894736842, + "grad_norm": 38.18095016479492, + "learning_rate": 3.0000000000000004e-07, + "loss": 5.2025, + "step": 6 + }, + { + "epoch": 0.06140350877192982, + "grad_norm": 39.12940979003906, + "learning_rate": 3.5000000000000004e-07, + "loss": 4.9896, + "step": 7 + }, + { + "epoch": 0.07017543859649122, + "grad_norm": 38.84568405151367, + "learning_rate": 4.0000000000000003e-07, + "loss": 5.1078, + "step": 8 + }, + { + "epoch": 0.07894736842105263, + "grad_norm": 39.38333511352539, + "learning_rate": 4.5000000000000003e-07, + "loss": 5.0808, + 
"step": 9 + }, + { + "epoch": 0.08771929824561403, + "grad_norm": 39.427650451660156, + "learning_rate": 5.000000000000001e-07, + "loss": 5.0534, + "step": 10 + }, + { + "epoch": 0.09649122807017543, + "grad_norm": 39.29513168334961, + "learning_rate": 5.5e-07, + "loss": 5.058, + "step": 11 + }, + { + "epoch": 0.10526315789473684, + "grad_norm": 39.641231536865234, + "learning_rate": 6.000000000000001e-07, + "loss": 5.0317, + "step": 12 + }, + { + "epoch": 0.11403508771929824, + "grad_norm": 37.91259765625, + "learning_rate": 6.5e-07, + "loss": 4.912, + "step": 13 + }, + { + "epoch": 0.12280701754385964, + "grad_norm": 38.203548431396484, + "learning_rate": 7.000000000000001e-07, + "loss": 4.9705, + "step": 14 + }, + { + "epoch": 0.13157894736842105, + "grad_norm": 39.15998840332031, + "learning_rate": 7.5e-07, + "loss": 4.6962, + "step": 15 + }, + { + "epoch": 0.14035087719298245, + "grad_norm": 37.754669189453125, + "learning_rate": 8.000000000000001e-07, + "loss": 4.6262, + "step": 16 + }, + { + "epoch": 0.14912280701754385, + "grad_norm": 35.871490478515625, + "learning_rate": 8.500000000000001e-07, + "loss": 4.5422, + "step": 17 + }, + { + "epoch": 0.15789473684210525, + "grad_norm": 36.16888427734375, + "learning_rate": 9.000000000000001e-07, + "loss": 4.664, + "step": 18 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 33.520118713378906, + "learning_rate": 9.500000000000001e-07, + "loss": 4.4697, + "step": 19 + }, + { + "epoch": 0.17543859649122806, + "grad_norm": 30.896282196044922, + "learning_rate": 1.0000000000000002e-06, + "loss": 4.3568, + "step": 20 + }, + { + "epoch": 0.18421052631578946, + "grad_norm": 29.944643020629883, + "learning_rate": 1.0500000000000001e-06, + "loss": 4.2269, + "step": 21 + }, + { + "epoch": 0.19298245614035087, + "grad_norm": 25.224485397338867, + "learning_rate": 1.1e-06, + "loss": 4.1272, + "step": 22 + }, + { + "epoch": 0.20175438596491227, + "grad_norm": 24.410480499267578, + "learning_rate": 
1.1500000000000002e-06, + "loss": 4.0585, + "step": 23 + }, + { + "epoch": 0.21052631578947367, + "grad_norm": 21.480648040771484, + "learning_rate": 1.2000000000000002e-06, + "loss": 3.9472, + "step": 24 + }, + { + "epoch": 0.21929824561403508, + "grad_norm": 20.61946678161621, + "learning_rate": 1.25e-06, + "loss": 3.8879, + "step": 25 + }, + { + "epoch": 0.22807017543859648, + "grad_norm": 19.578271865844727, + "learning_rate": 1.3e-06, + "loss": 3.6783, + "step": 26 + }, + { + "epoch": 0.23684210526315788, + "grad_norm": 17.418983459472656, + "learning_rate": 1.3500000000000002e-06, + "loss": 3.6826, + "step": 27 + }, + { + "epoch": 0.24561403508771928, + "grad_norm": 18.160301208496094, + "learning_rate": 1.4000000000000001e-06, + "loss": 3.478, + "step": 28 + }, + { + "epoch": 0.2543859649122807, + "grad_norm": 17.573204040527344, + "learning_rate": 1.45e-06, + "loss": 3.459, + "step": 29 + }, + { + "epoch": 0.2631578947368421, + "grad_norm": 17.1265869140625, + "learning_rate": 1.5e-06, + "loss": 3.3999, + "step": 30 + }, + { + "epoch": 0.2719298245614035, + "grad_norm": 15.527145385742188, + "learning_rate": 1.5500000000000002e-06, + "loss": 3.2817, + "step": 31 + }, + { + "epoch": 0.2807017543859649, + "grad_norm": 14.773847579956055, + "learning_rate": 1.6000000000000001e-06, + "loss": 3.234, + "step": 32 + }, + { + "epoch": 0.2894736842105263, + "grad_norm": 12.039301872253418, + "learning_rate": 1.6500000000000003e-06, + "loss": 3.132, + "step": 33 + }, + { + "epoch": 0.2982456140350877, + "grad_norm": 9.217979431152344, + "learning_rate": 1.7000000000000002e-06, + "loss": 3.0548, + "step": 34 + }, + { + "epoch": 0.30701754385964913, + "grad_norm": 7.575639724731445, + "learning_rate": 1.75e-06, + "loss": 2.9529, + "step": 35 + }, + { + "epoch": 0.3157894736842105, + "grad_norm": 7.496004104614258, + "learning_rate": 1.8000000000000001e-06, + "loss": 2.8967, + "step": 36 + }, + { + "epoch": 0.32456140350877194, + "grad_norm": 7.45414924621582, + 
"learning_rate": 1.85e-06, + "loss": 2.8837, + "step": 37 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 8.555658340454102, + "learning_rate": 1.9000000000000002e-06, + "loss": 2.7473, + "step": 38 + }, + { + "epoch": 0.34210526315789475, + "grad_norm": 10.03805160522461, + "learning_rate": 1.9500000000000004e-06, + "loss": 2.7355, + "step": 39 + }, + { + "epoch": 0.3508771929824561, + "grad_norm": 9.30649471282959, + "learning_rate": 2.0000000000000003e-06, + "loss": 2.6587, + "step": 40 + }, + { + "epoch": 0.35964912280701755, + "grad_norm": 8.510339736938477, + "learning_rate": 2.05e-06, + "loss": 2.5977, + "step": 41 + }, + { + "epoch": 0.3684210526315789, + "grad_norm": 4.709080696105957, + "learning_rate": 2.1000000000000002e-06, + "loss": 2.6286, + "step": 42 + }, + { + "epoch": 0.37719298245614036, + "grad_norm": 5.128961086273193, + "learning_rate": 2.15e-06, + "loss": 2.4558, + "step": 43 + }, + { + "epoch": 0.38596491228070173, + "grad_norm": 5.190136432647705, + "learning_rate": 2.2e-06, + "loss": 2.4432, + "step": 44 + }, + { + "epoch": 0.39473684210526316, + "grad_norm": 4.893551349639893, + "learning_rate": 2.25e-06, + "loss": 2.4939, + "step": 45 + }, + { + "epoch": 0.40350877192982454, + "grad_norm": 5.2434983253479, + "learning_rate": 2.3000000000000004e-06, + "loss": 2.3381, + "step": 46 + }, + { + "epoch": 0.41228070175438597, + "grad_norm": 5.122412204742432, + "learning_rate": 2.35e-06, + "loss": 2.313, + "step": 47 + }, + { + "epoch": 0.42105263157894735, + "grad_norm": 4.577274799346924, + "learning_rate": 2.4000000000000003e-06, + "loss": 2.2236, + "step": 48 + }, + { + "epoch": 0.4298245614035088, + "grad_norm": 4.722769737243652, + "learning_rate": 2.4500000000000003e-06, + "loss": 2.1987, + "step": 49 + }, + { + "epoch": 0.43859649122807015, + "grad_norm": 5.059235095977783, + "learning_rate": 2.5e-06, + "loss": 2.1415, + "step": 50 + }, + { + "epoch": 0.4473684210526316, + "grad_norm": 4.454439640045166, + "learning_rate": 
2.55e-06, + "loss": 2.0466, + "step": 51 + }, + { + "epoch": 0.45614035087719296, + "grad_norm": 4.94586706161499, + "learning_rate": 2.6e-06, + "loss": 1.8762, + "step": 52 + }, + { + "epoch": 0.4649122807017544, + "grad_norm": 4.704402446746826, + "learning_rate": 2.6500000000000005e-06, + "loss": 1.8012, + "step": 53 + }, + { + "epoch": 0.47368421052631576, + "grad_norm": 6.125903129577637, + "learning_rate": 2.7000000000000004e-06, + "loss": 1.7669, + "step": 54 + }, + { + "epoch": 0.4824561403508772, + "grad_norm": 4.5356059074401855, + "learning_rate": 2.7500000000000004e-06, + "loss": 1.6607, + "step": 55 + }, + { + "epoch": 0.49122807017543857, + "grad_norm": 6.56803035736084, + "learning_rate": 2.8000000000000003e-06, + "loss": 1.6291, + "step": 56 + }, + { + "epoch": 0.5, + "grad_norm": 4.910050392150879, + "learning_rate": 2.85e-06, + "loss": 1.5545, + "step": 57 + }, + { + "epoch": 0.5087719298245614, + "grad_norm": 8.733433723449707, + "learning_rate": 2.9e-06, + "loss": 1.4206, + "step": 58 + }, + { + "epoch": 0.5175438596491229, + "grad_norm": 8.582486152648926, + "learning_rate": 2.95e-06, + "loss": 1.3912, + "step": 59 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 13.710689544677734, + "learning_rate": 3e-06, + "loss": 1.3297, + "step": 60 + }, + { + "epoch": 0.5350877192982456, + "grad_norm": 23.400312423706055, + "learning_rate": 3.05e-06, + "loss": 1.296, + "step": 61 + }, + { + "epoch": 0.543859649122807, + "grad_norm": 5.678805351257324, + "learning_rate": 3.1000000000000004e-06, + "loss": 1.2259, + "step": 62 + }, + { + "epoch": 0.5526315789473685, + "grad_norm": 14.700899124145508, + "learning_rate": 3.1500000000000003e-06, + "loss": 1.1087, + "step": 63 + }, + { + "epoch": 0.5614035087719298, + "grad_norm": 19.38919448852539, + "learning_rate": 3.2000000000000003e-06, + "loss": 1.1805, + "step": 64 + }, + { + "epoch": 0.5701754385964912, + "grad_norm": 8.460039138793945, + "learning_rate": 3.2500000000000002e-06, + "loss": 1.0963, 
+ "step": 65 + }, + { + "epoch": 0.5789473684210527, + "grad_norm": 13.371014595031738, + "learning_rate": 3.3000000000000006e-06, + "loss": 1.0627, + "step": 66 + }, + { + "epoch": 0.5877192982456141, + "grad_norm": 22.380569458007812, + "learning_rate": 3.3500000000000005e-06, + "loss": 1.0869, + "step": 67 + }, + { + "epoch": 0.5964912280701754, + "grad_norm": 5.780513286590576, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.9991, + "step": 68 + }, + { + "epoch": 0.6052631578947368, + "grad_norm": 19.850841522216797, + "learning_rate": 3.45e-06, + "loss": 0.9683, + "step": 69 + }, + { + "epoch": 0.6140350877192983, + "grad_norm": 17.160703659057617, + "learning_rate": 3.5e-06, + "loss": 0.845, + "step": 70 + }, + { + "epoch": 0.6228070175438597, + "grad_norm": 14.264311790466309, + "learning_rate": 3.5500000000000003e-06, + "loss": 0.8059, + "step": 71 + }, + { + "epoch": 0.631578947368421, + "grad_norm": 26.39459991455078, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.85, + "step": 72 + }, + { + "epoch": 0.6403508771929824, + "grad_norm": 51.10348892211914, + "learning_rate": 3.65e-06, + "loss": 0.9755, + "step": 73 + }, + { + "epoch": 0.6491228070175439, + "grad_norm": 28.795856475830078, + "learning_rate": 3.7e-06, + "loss": 0.8966, + "step": 74 + }, + { + "epoch": 0.6578947368421053, + "grad_norm": 4.6617937088012695, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.7716, + "step": 75 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 15.729666709899902, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.7578, + "step": 76 + }, + { + "epoch": 0.6754385964912281, + "grad_norm": 7.109970569610596, + "learning_rate": 3.85e-06, + "loss": 0.7055, + "step": 77 + }, + { + "epoch": 0.6842105263157895, + "grad_norm": 20.84659194946289, + "learning_rate": 3.900000000000001e-06, + "loss": 0.7458, + "step": 78 + }, + { + "epoch": 0.6929824561403509, + "grad_norm": 21.601303100585938, + "learning_rate": 3.95e-06, + "loss": 0.6879, + "step": 
79 + }, + { + "epoch": 0.7017543859649122, + "grad_norm": 3.6914751529693604, + "learning_rate": 4.000000000000001e-06, + "loss": 0.6179, + "step": 80 + }, + { + "epoch": 0.7105263157894737, + "grad_norm": 16.539325714111328, + "learning_rate": 4.05e-06, + "loss": 0.5716, + "step": 81 + }, + { + "epoch": 0.7192982456140351, + "grad_norm": 13.931925773620605, + "learning_rate": 4.1e-06, + "loss": 0.558, + "step": 82 + }, + { + "epoch": 0.7280701754385965, + "grad_norm": 10.52951717376709, + "learning_rate": 4.15e-06, + "loss": 0.6018, + "step": 83 + }, + { + "epoch": 0.7368421052631579, + "grad_norm": 17.337060928344727, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.5501, + "step": 84 + }, + { + "epoch": 0.7456140350877193, + "grad_norm": 13.500468254089355, + "learning_rate": 4.25e-06, + "loss": 0.5214, + "step": 85 + }, + { + "epoch": 0.7543859649122807, + "grad_norm": 10.290645599365234, + "learning_rate": 4.3e-06, + "loss": 0.4996, + "step": 86 + }, + { + "epoch": 0.7631578947368421, + "grad_norm": 9.757556915283203, + "learning_rate": 4.350000000000001e-06, + "loss": 0.498, + "step": 87 + }, + { + "epoch": 0.7719298245614035, + "grad_norm": 9.325140953063965, + "learning_rate": 4.4e-06, + "loss": 0.4721, + "step": 88 + }, + { + "epoch": 0.7807017543859649, + "grad_norm": 2.9322128295898438, + "learning_rate": 4.450000000000001e-06, + "loss": 0.4528, + "step": 89 + }, + { + "epoch": 0.7894736842105263, + "grad_norm": 10.484073638916016, + "learning_rate": 4.5e-06, + "loss": 0.445, + "step": 90 + }, + { + "epoch": 0.7982456140350878, + "grad_norm": 32.7827262878418, + "learning_rate": 4.5500000000000005e-06, + "loss": 0.5105, + "step": 91 + }, + { + "epoch": 0.8070175438596491, + "grad_norm": 2.8477306365966797, + "learning_rate": 4.600000000000001e-06, + "loss": 0.4117, + "step": 92 + }, + { + "epoch": 0.8157894736842105, + "grad_norm": 2.7680225372314453, + "learning_rate": 4.65e-06, + "loss": 0.3653, + "step": 93 + }, + { + "epoch": 
0.8245614035087719, + "grad_norm": 2.6512742042541504, + "learning_rate": 4.7e-06, + "loss": 0.3878, + "step": 94 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 6.453914165496826, + "learning_rate": 4.75e-06, + "loss": 0.3611, + "step": 95 + }, + { + "epoch": 0.8421052631578947, + "grad_norm": 3.4594080448150635, + "learning_rate": 4.800000000000001e-06, + "loss": 0.3817, + "step": 96 + }, + { + "epoch": 0.8508771929824561, + "grad_norm": 3.6144917011260986, + "learning_rate": 4.85e-06, + "loss": 0.3618, + "step": 97 + }, + { + "epoch": 0.8596491228070176, + "grad_norm": 5.349407196044922, + "learning_rate": 4.9000000000000005e-06, + "loss": 0.3218, + "step": 98 + }, + { + "epoch": 0.868421052631579, + "grad_norm": 13.671236991882324, + "learning_rate": 4.95e-06, + "loss": 0.3329, + "step": 99 + }, + { + "epoch": 0.8771929824561403, + "grad_norm": 5.84046745300293, + "learning_rate": 5e-06, + "loss": 0.2967, + "step": 100 + }, + { + "epoch": 0.8859649122807017, + "grad_norm": 14.005338668823242, + "learning_rate": 4.999963827125897e-06, + "loss": 0.303, + "step": 101 + }, + { + "epoch": 0.8947368421052632, + "grad_norm": 9.18114185333252, + "learning_rate": 4.999855309550366e-06, + "loss": 0.2762, + "step": 102 + }, + { + "epoch": 0.9035087719298246, + "grad_norm": 3.0800487995147705, + "learning_rate": 4.999674450413725e-06, + "loss": 0.2628, + "step": 103 + }, + { + "epoch": 0.9122807017543859, + "grad_norm": 82.03578186035156, + "learning_rate": 4.999421254949728e-06, + "loss": 0.4065, + "step": 104 + }, + { + "epoch": 0.9210526315789473, + "grad_norm": 77.66315460205078, + "learning_rate": 4.99909573048542e-06, + "loss": 0.4307, + "step": 105 + }, + { + "epoch": 0.9298245614035088, + "grad_norm": 18.28767967224121, + "learning_rate": 4.998697886440927e-06, + "loss": 0.2571, + "step": 106 + }, + { + "epoch": 0.9385964912280702, + "grad_norm": 5.960445880889893, + "learning_rate": 4.998227734329177e-06, + "loss": 0.2847, + "step": 107 + }, + { + "epoch": 
0.9473684210526315, + "grad_norm": 5.437699794769287, + "learning_rate": 4.9976852877555755e-06, + "loss": 0.2728, + "step": 108 + }, + { + "epoch": 0.956140350877193, + "grad_norm": 3.379631280899048, + "learning_rate": 4.997070562417602e-06, + "loss": 0.2467, + "step": 109 + }, + { + "epoch": 0.9649122807017544, + "grad_norm": 3.1625075340270996, + "learning_rate": 4.996383576104362e-06, + "loss": 0.2273, + "step": 110 + }, + { + "epoch": 0.9736842105263158, + "grad_norm": 15.588600158691406, + "learning_rate": 4.995624348696071e-06, + "loss": 0.2486, + "step": 111 + }, + { + "epoch": 0.9824561403508771, + "grad_norm": 2.631044387817383, + "learning_rate": 4.9947929021634815e-06, + "loss": 0.1964, + "step": 112 + }, + { + "epoch": 0.9912280701754386, + "grad_norm": 4.706504821777344, + "learning_rate": 4.993889260567239e-06, + "loss": 0.1901, + "step": 113 + }, + { + "epoch": 1.0, + "grad_norm": 10.368465423583984, + "learning_rate": 4.9929134500571954e-06, + "loss": 0.1996, + "step": 114 + }, + { + "epoch": 1.0087719298245614, + "grad_norm": 30.44986343383789, + "learning_rate": 4.991865498871647e-06, + "loss": 0.2606, + "step": 115 + }, + { + "epoch": 1.0175438596491229, + "grad_norm": 14.421515464782715, + "learning_rate": 4.99074543733652e-06, + "loss": 0.2394, + "step": 116 + }, + { + "epoch": 1.0263157894736843, + "grad_norm": 14.072005271911621, + "learning_rate": 4.989553297864489e-06, + "loss": 0.2288, + "step": 117 + }, + { + "epoch": 1.0350877192982457, + "grad_norm": 4.395325660705566, + "learning_rate": 4.988289114954045e-06, + "loss": 0.2129, + "step": 118 + }, + { + "epoch": 1.043859649122807, + "grad_norm": 7.286703586578369, + "learning_rate": 4.986952925188489e-06, + "loss": 0.186, + "step": 119 + }, + { + "epoch": 1.0526315789473684, + "grad_norm": 8.332784652709961, + "learning_rate": 4.98554476723488e-06, + "loss": 0.178, + "step": 120 + }, + { + "epoch": 1.0614035087719298, + "grad_norm": 1.3646447658538818, + "learning_rate": 
4.984064681842917e-06, + "loss": 0.1687, + "step": 121 + }, + { + "epoch": 1.0701754385964912, + "grad_norm": 4.494940757751465, + "learning_rate": 4.982512711843753e-06, + "loss": 0.1881, + "step": 122 + }, + { + "epoch": 1.0789473684210527, + "grad_norm": 3.3929836750030518, + "learning_rate": 4.980888902148757e-06, + "loss": 0.1764, + "step": 123 + }, + { + "epoch": 1.087719298245614, + "grad_norm": 1.8281155824661255, + "learning_rate": 4.979193299748225e-06, + "loss": 0.1602, + "step": 124 + }, + { + "epoch": 1.0964912280701755, + "grad_norm": 3.494239568710327, + "learning_rate": 4.977425953710005e-06, + "loss": 0.1729, + "step": 125 + }, + { + "epoch": 1.1052631578947367, + "grad_norm": 1.500410556793213, + "learning_rate": 4.975586915178084e-06, + "loss": 0.1666, + "step": 126 + }, + { + "epoch": 1.1140350877192982, + "grad_norm": 1.4680222272872925, + "learning_rate": 4.973676237371111e-06, + "loss": 0.159, + "step": 127 + }, + { + "epoch": 1.1228070175438596, + "grad_norm": 3.0383460521698, + "learning_rate": 4.971693975580851e-06, + "loss": 0.1484, + "step": 128 + }, + { + "epoch": 1.131578947368421, + "grad_norm": 3.74821138381958, + "learning_rate": 4.969640187170591e-06, + "loss": 0.1586, + "step": 129 + }, + { + "epoch": 1.1403508771929824, + "grad_norm": 4.682602405548096, + "learning_rate": 4.967514931573473e-06, + "loss": 0.1619, + "step": 130 + }, + { + "epoch": 1.1491228070175439, + "grad_norm": 3.90673565864563, + "learning_rate": 4.965318270290779e-06, + "loss": 0.164, + "step": 131 + }, + { + "epoch": 1.1578947368421053, + "grad_norm": 2.2017388343811035, + "learning_rate": 4.963050266890152e-06, + "loss": 0.1499, + "step": 132 + }, + { + "epoch": 1.1666666666666667, + "grad_norm": 2.4211816787719727, + "learning_rate": 4.960710987003753e-06, + "loss": 0.1387, + "step": 133 + }, + { + "epoch": 1.1754385964912282, + "grad_norm": 1.7753759622573853, + "learning_rate": 4.958300498326363e-06, + "loss": 0.1441, + "step": 134 + }, + { + "epoch": 
1.1842105263157894, + "grad_norm": 1.5529910326004028, + "learning_rate": 4.955818870613425e-06, + "loss": 0.1304, + "step": 135 + }, + { + "epoch": 1.1929824561403508, + "grad_norm": 2.090593099594116, + "learning_rate": 4.953266175679023e-06, + "loss": 0.1419, + "step": 136 + }, + { + "epoch": 1.2017543859649122, + "grad_norm": 2.7141878604888916, + "learning_rate": 4.95064248739381e-06, + "loss": 0.1444, + "step": 137 + }, + { + "epoch": 1.2105263157894737, + "grad_norm": 2.3690481185913086, + "learning_rate": 4.947947881682861e-06, + "loss": 0.1383, + "step": 138 + }, + { + "epoch": 1.219298245614035, + "grad_norm": 2.2403147220611572, + "learning_rate": 4.945182436523482e-06, + "loss": 0.1418, + "step": 139 + }, + { + "epoch": 1.2280701754385965, + "grad_norm": 1.3939160108566284, + "learning_rate": 4.942346231942955e-06, + "loss": 0.1307, + "step": 140 + }, + { + "epoch": 1.236842105263158, + "grad_norm": 11.276732444763184, + "learning_rate": 4.939439350016214e-06, + "loss": 0.1397, + "step": 141 + }, + { + "epoch": 1.2456140350877192, + "grad_norm": 8.260516166687012, + "learning_rate": 4.9364618748634794e-06, + "loss": 0.1426, + "step": 142 + }, + { + "epoch": 1.2543859649122808, + "grad_norm": 2.09720516204834, + "learning_rate": 4.933413892647819e-06, + "loss": 0.1323, + "step": 143 + }, + { + "epoch": 1.263157894736842, + "grad_norm": 1.802125334739685, + "learning_rate": 4.9302954915726535e-06, + "loss": 0.1304, + "step": 144 + }, + { + "epoch": 1.2719298245614035, + "grad_norm": 1.7151471376419067, + "learning_rate": 4.927106761879207e-06, + "loss": 0.1264, + "step": 145 + }, + { + "epoch": 1.280701754385965, + "grad_norm": 1.6970336437225342, + "learning_rate": 4.923847795843894e-06, + "loss": 0.1227, + "step": 146 + }, + { + "epoch": 1.2894736842105263, + "grad_norm": 16.60441017150879, + "learning_rate": 4.920518687775647e-06, + "loss": 0.1606, + "step": 147 + }, + { + "epoch": 1.2982456140350878, + "grad_norm": 6.470354080200195, + 
"learning_rate": 4.917119534013194e-06, + "loss": 0.1447, + "step": 148 + }, + { + "epoch": 1.3070175438596492, + "grad_norm": 1.4908231496810913, + "learning_rate": 4.913650432922264e-06, + "loss": 0.1343, + "step": 149 + }, + { + "epoch": 1.3157894736842106, + "grad_norm": 3.19964861869812, + "learning_rate": 4.91011148489274e-06, + "loss": 0.1354, + "step": 150 + }, + { + "epoch": 1.3245614035087718, + "grad_norm": 2.6052839756011963, + "learning_rate": 4.906502792335761e-06, + "loss": 0.1342, + "step": 151 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 2.0719165802001953, + "learning_rate": 4.9028244596807525e-06, + "loss": 0.1359, + "step": 152 + }, + { + "epoch": 1.3421052631578947, + "grad_norm": 0.8086919784545898, + "learning_rate": 4.899076593372405e-06, + "loss": 0.1279, + "step": 153 + }, + { + "epoch": 1.3508771929824561, + "grad_norm": 1.0056848526000977, + "learning_rate": 4.8952593018675955e-06, + "loss": 0.1162, + "step": 154 + }, + { + "epoch": 1.3596491228070176, + "grad_norm": 5.72553014755249, + "learning_rate": 4.891372695632249e-06, + "loss": 0.1315, + "step": 155 + }, + { + "epoch": 1.368421052631579, + "grad_norm": 1.522894024848938, + "learning_rate": 4.887416887138139e-06, + "loss": 0.1266, + "step": 156 + }, + { + "epoch": 1.3771929824561404, + "grad_norm": 2.019472122192383, + "learning_rate": 4.883391990859635e-06, + "loss": 0.1262, + "step": 157 + }, + { + "epoch": 1.3859649122807016, + "grad_norm": 1.8594422340393066, + "learning_rate": 4.879298123270391e-06, + "loss": 0.125, + "step": 158 + }, + { + "epoch": 1.3947368421052633, + "grad_norm": 1.365377426147461, + "learning_rate": 4.8751354028399725e-06, + "loss": 0.1218, + "step": 159 + }, + { + "epoch": 1.4035087719298245, + "grad_norm": 3.553309917449951, + "learning_rate": 4.870903950030429e-06, + "loss": 0.1272, + "step": 160 + }, + { + "epoch": 1.412280701754386, + "grad_norm": 2.1770920753479004, + "learning_rate": 4.866603887292809e-06, + "loss": 0.1213, + "step": 161 
+ }, + { + "epoch": 1.4210526315789473, + "grad_norm": 1.6058955192565918, + "learning_rate": 4.862235339063613e-06, + "loss": 0.1173, + "step": 162 + }, + { + "epoch": 1.4298245614035088, + "grad_norm": 1.3208314180374146, + "learning_rate": 4.857798431761199e-06, + "loss": 0.1183, + "step": 163 + }, + { + "epoch": 1.4385964912280702, + "grad_norm": 1.282729983329773, + "learning_rate": 4.853293293782118e-06, + "loss": 0.1209, + "step": 164 + }, + { + "epoch": 1.4473684210526316, + "grad_norm": 1.3838152885437012, + "learning_rate": 4.848720055497401e-06, + "loss": 0.1198, + "step": 165 + }, + { + "epoch": 1.456140350877193, + "grad_norm": 1.2930737733840942, + "learning_rate": 4.844078849248785e-06, + "loss": 0.1268, + "step": 166 + }, + { + "epoch": 1.4649122807017543, + "grad_norm": 1.7022266387939453, + "learning_rate": 4.839369809344888e-06, + "loss": 0.1198, + "step": 167 + }, + { + "epoch": 1.4736842105263157, + "grad_norm": 1.0927815437316895, + "learning_rate": 4.834593072057313e-06, + "loss": 0.1132, + "step": 168 + }, + { + "epoch": 1.4824561403508771, + "grad_norm": 0.9326333999633789, + "learning_rate": 4.829748775616716e-06, + "loss": 0.1193, + "step": 169 + }, + { + "epoch": 1.4912280701754386, + "grad_norm": 1.3564742803573608, + "learning_rate": 4.8248370602087954e-06, + "loss": 0.118, + "step": 170 + }, + { + "epoch": 1.5, + "grad_norm": 1.19778573513031, + "learning_rate": 4.819858067970243e-06, + "loss": 0.1122, + "step": 171 + }, + { + "epoch": 1.5087719298245614, + "grad_norm": 2.8438351154327393, + "learning_rate": 4.814811942984625e-06, + "loss": 0.1217, + "step": 172 + }, + { + "epoch": 1.5175438596491229, + "grad_norm": 1.0701063871383667, + "learning_rate": 4.809698831278217e-06, + "loss": 0.1114, + "step": 173 + }, + { + "epoch": 1.526315789473684, + "grad_norm": 0.9053553938865662, + "learning_rate": 4.804518880815776e-06, + "loss": 0.1178, + "step": 174 + }, + { + "epoch": 1.5350877192982457, + "grad_norm": 0.42274603247642517, + 
"learning_rate": 4.799272241496259e-06, + "loss": 0.1091, + "step": 175 + }, + { + "epoch": 1.543859649122807, + "grad_norm": 0.8576470017433167, + "learning_rate": 4.793959065148484e-06, + "loss": 0.1134, + "step": 176 + }, + { + "epoch": 1.5526315789473686, + "grad_norm": 0.5910662412643433, + "learning_rate": 4.78857950552674e-06, + "loss": 0.1148, + "step": 177 + }, + { + "epoch": 1.5614035087719298, + "grad_norm": 0.8761632442474365, + "learning_rate": 4.783133718306331e-06, + "loss": 0.1125, + "step": 178 + }, + { + "epoch": 1.5701754385964912, + "grad_norm": 1.9190795421600342, + "learning_rate": 4.777621861079079e-06, + "loss": 0.1148, + "step": 179 + }, + { + "epoch": 1.5789473684210527, + "grad_norm": 0.6199957728385925, + "learning_rate": 4.772044093348757e-06, + "loss": 0.1097, + "step": 180 + }, + { + "epoch": 1.587719298245614, + "grad_norm": 1.562089443206787, + "learning_rate": 4.766400576526479e-06, + "loss": 0.1097, + "step": 181 + }, + { + "epoch": 1.5964912280701755, + "grad_norm": 1.4957091808319092, + "learning_rate": 4.760691473926021e-06, + "loss": 0.1216, + "step": 182 + }, + { + "epoch": 1.6052631578947367, + "grad_norm": 0.9863570332527161, + "learning_rate": 4.754916950759105e-06, + "loss": 0.1122, + "step": 183 + }, + { + "epoch": 1.6140350877192984, + "grad_norm": 0.5803346633911133, + "learning_rate": 4.749077174130609e-06, + "loss": 0.1103, + "step": 184 + }, + { + "epoch": 1.6228070175438596, + "grad_norm": 1.8789891004562378, + "learning_rate": 4.743172313033738e-06, + "loss": 0.1191, + "step": 185 + }, + { + "epoch": 1.631578947368421, + "grad_norm": 0.8731380105018616, + "learning_rate": 4.7372025383451285e-06, + "loss": 0.1154, + "step": 186 + }, + { + "epoch": 1.6403508771929824, + "grad_norm": 1.3535627126693726, + "learning_rate": 4.7311680228199075e-06, + "loss": 0.1123, + "step": 187 + }, + { + "epoch": 1.6491228070175439, + "grad_norm": 0.7211089134216309, + "learning_rate": 4.725068941086693e-06, + "loss": 0.1134, + 
"step": 188 + }, + { + "epoch": 1.6578947368421053, + "grad_norm": 1.4752328395843506, + "learning_rate": 4.718905469642534e-06, + "loss": 0.1185, + "step": 189 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.9822680354118347, + "learning_rate": 4.712677786847814e-06, + "loss": 0.1146, + "step": 190 + }, + { + "epoch": 1.6754385964912282, + "grad_norm": 1.1308330297470093, + "learning_rate": 4.706386072921083e-06, + "loss": 0.1061, + "step": 191 + }, + { + "epoch": 1.6842105263157894, + "grad_norm": 5.331939697265625, + "learning_rate": 4.70003050993384e-06, + "loss": 0.1153, + "step": 192 + }, + { + "epoch": 1.692982456140351, + "grad_norm": 0.6911673545837402, + "learning_rate": 4.6936112818052674e-06, + "loss": 0.1098, + "step": 193 + }, + { + "epoch": 1.7017543859649122, + "grad_norm": 0.5160980224609375, + "learning_rate": 4.687128574296912e-06, + "loss": 0.1073, + "step": 194 + }, + { + "epoch": 1.7105263157894737, + "grad_norm": 1.5724798440933228, + "learning_rate": 4.680582575007303e-06, + "loss": 0.121, + "step": 195 + }, + { + "epoch": 1.719298245614035, + "grad_norm": 1.3960011005401611, + "learning_rate": 4.6739734733665275e-06, + "loss": 0.1145, + "step": 196 + }, + { + "epoch": 1.7280701754385965, + "grad_norm": 1.4949183464050293, + "learning_rate": 4.6673014606307465e-06, + "loss": 0.1166, + "step": 197 + }, + { + "epoch": 1.736842105263158, + "grad_norm": 1.6873422861099243, + "learning_rate": 4.660566729876661e-06, + "loss": 0.1115, + "step": 198 + }, + { + "epoch": 1.7456140350877192, + "grad_norm": 1.3443641662597656, + "learning_rate": 4.653769475995926e-06, + "loss": 0.1119, + "step": 199 + }, + { + "epoch": 1.7543859649122808, + "grad_norm": 0.807525098323822, + "learning_rate": 4.646909895689508e-06, + "loss": 0.1059, + "step": 200 + }, + { + "epoch": 1.763157894736842, + "grad_norm": 1.589316964149475, + "learning_rate": 4.639988187461995e-06, + "loss": 0.1151, + "step": 201 + }, + { + "epoch": 1.7719298245614035, + "grad_norm": 
2.474756956100464, + "learning_rate": 4.633004551615851e-06, + "loss": 0.116, + "step": 202 + }, + { + "epoch": 1.780701754385965, + "grad_norm": 0.6210195422172546, + "learning_rate": 4.62595919024562e-06, + "loss": 0.1097, + "step": 203 + }, + { + "epoch": 1.7894736842105263, + "grad_norm": 0.7217905521392822, + "learning_rate": 4.618852307232078e-06, + "loss": 0.1117, + "step": 204 + }, + { + "epoch": 1.7982456140350878, + "grad_norm": 1.551251769065857, + "learning_rate": 4.611684108236334e-06, + "loss": 0.113, + "step": 205 + }, + { + "epoch": 1.807017543859649, + "grad_norm": 0.6619828939437866, + "learning_rate": 4.604454800693874e-06, + "loss": 0.113, + "step": 206 + }, + { + "epoch": 1.8157894736842106, + "grad_norm": 0.9461805820465088, + "learning_rate": 4.597164593808564e-06, + "loss": 0.1093, + "step": 207 + }, + { + "epoch": 1.8245614035087718, + "grad_norm": 1.2926547527313232, + "learning_rate": 4.589813698546592e-06, + "loss": 0.1128, + "step": 208 + }, + { + "epoch": 1.8333333333333335, + "grad_norm": 0.8754212856292725, + "learning_rate": 4.582402327630368e-06, + "loss": 0.1104, + "step": 209 + }, + { + "epoch": 1.8421052631578947, + "grad_norm": 0.846051812171936, + "learning_rate": 4.574930695532357e-06, + "loss": 0.1105, + "step": 210 + }, + { + "epoch": 1.8508771929824561, + "grad_norm": 1.3332515954971313, + "learning_rate": 4.567399018468889e-06, + "loss": 0.1101, + "step": 211 + }, + { + "epoch": 1.8596491228070176, + "grad_norm": 0.8729192614555359, + "learning_rate": 4.5598075143938855e-06, + "loss": 0.1081, + "step": 212 + }, + { + "epoch": 1.868421052631579, + "grad_norm": 0.8618345260620117, + "learning_rate": 4.552156402992567e-06, + "loss": 0.1059, + "step": 213 + }, + { + "epoch": 1.8771929824561404, + "grad_norm": 1.2135930061340332, + "learning_rate": 4.544445905675082e-06, + "loss": 0.1105, + "step": 214 + }, + { + "epoch": 1.8859649122807016, + "grad_norm": 0.8405666351318359, + "learning_rate": 4.536676245570111e-06, + "loss": 
0.1118, + "step": 215 + }, + { + "epoch": 1.8947368421052633, + "grad_norm": 0.42860639095306396, + "learning_rate": 4.528847647518403e-06, + "loss": 0.1093, + "step": 216 + }, + { + "epoch": 1.9035087719298245, + "grad_norm": 1.1538206338882446, + "learning_rate": 4.520960338066271e-06, + "loss": 0.1088, + "step": 217 + }, + { + "epoch": 1.912280701754386, + "grad_norm": 0.5870749354362488, + "learning_rate": 4.513014545459038e-06, + "loss": 0.1061, + "step": 218 + }, + { + "epoch": 1.9210526315789473, + "grad_norm": 0.7279748916625977, + "learning_rate": 4.505010499634427e-06, + "loss": 0.1032, + "step": 219 + }, + { + "epoch": 1.9298245614035088, + "grad_norm": 0.6331414580345154, + "learning_rate": 4.4969484322159125e-06, + "loss": 0.1109, + "step": 220 + }, + { + "epoch": 1.9385964912280702, + "grad_norm": 0.9024543166160583, + "learning_rate": 4.488828576506014e-06, + "loss": 0.1094, + "step": 221 + }, + { + "epoch": 1.9473684210526314, + "grad_norm": 3.540376901626587, + "learning_rate": 4.480651167479545e-06, + "loss": 0.1154, + "step": 222 + }, + { + "epoch": 1.956140350877193, + "grad_norm": 0.9506739377975464, + "learning_rate": 4.472416441776817e-06, + "loss": 0.108, + "step": 223 + }, + { + "epoch": 1.9649122807017543, + "grad_norm": 0.6585081815719604, + "learning_rate": 4.464124637696786e-06, + "loss": 0.1033, + "step": 224 + }, + { + "epoch": 1.973684210526316, + "grad_norm": 1.143038034439087, + "learning_rate": 4.455775995190161e-06, + "loss": 0.1092, + "step": 225 + }, + { + "epoch": 1.9824561403508771, + "grad_norm": 1.148261547088623, + "learning_rate": 4.4473707558524555e-06, + "loss": 0.1076, + "step": 226 + }, + { + "epoch": 1.9912280701754386, + "grad_norm": 0.7375811338424683, + "learning_rate": 4.438909162917003e-06, + "loss": 0.108, + "step": 227 + }, + { + "epoch": 2.0, + "grad_norm": 0.5254591703414917, + "learning_rate": 4.430391461247911e-06, + "loss": 0.1079, + "step": 228 + }, + { + "epoch": 2.008771929824561, + "grad_norm": 
1.0198495388031006, + "learning_rate": 4.42181789733298e-06, + "loss": 0.1083, + "step": 229 + }, + { + "epoch": 2.017543859649123, + "grad_norm": 0.9234157800674438, + "learning_rate": 4.413188719276569e-06, + "loss": 0.1084, + "step": 230 + }, + { + "epoch": 2.026315789473684, + "grad_norm": 0.5215068459510803, + "learning_rate": 4.404504176792414e-06, + "loss": 0.1067, + "step": 231 + }, + { + "epoch": 2.0350877192982457, + "grad_norm": 0.9296736121177673, + "learning_rate": 4.3957645211964065e-06, + "loss": 0.1066, + "step": 232 + }, + { + "epoch": 2.043859649122807, + "grad_norm": 0.8660671710968018, + "learning_rate": 4.386970005399314e-06, + "loss": 0.108, + "step": 233 + }, + { + "epoch": 2.0526315789473686, + "grad_norm": 0.6014883518218994, + "learning_rate": 4.378120883899467e-06, + "loss": 0.1068, + "step": 234 + }, + { + "epoch": 2.06140350877193, + "grad_norm": 0.6370371580123901, + "learning_rate": 4.369217412775393e-06, + "loss": 0.1076, + "step": 235 + }, + { + "epoch": 2.0701754385964914, + "grad_norm": 0.9806828498840332, + "learning_rate": 4.360259849678402e-06, + "loss": 0.1071, + "step": 236 + }, + { + "epoch": 2.0789473684210527, + "grad_norm": 0.6093440651893616, + "learning_rate": 4.351248453825137e-06, + "loss": 0.1038, + "step": 237 + }, + { + "epoch": 2.087719298245614, + "grad_norm": 1.3494842052459717, + "learning_rate": 4.3421834859900695e-06, + "loss": 0.1105, + "step": 238 + }, + { + "epoch": 2.0964912280701755, + "grad_norm": 0.7621576189994812, + "learning_rate": 4.333065208497949e-06, + "loss": 0.1048, + "step": 239 + }, + { + "epoch": 2.1052631578947367, + "grad_norm": 0.5918282866477966, + "learning_rate": 4.3238938852162195e-06, + "loss": 0.1086, + "step": 240 + }, + { + "epoch": 2.1140350877192984, + "grad_norm": 0.7048676609992981, + "learning_rate": 4.314669781547379e-06, + "loss": 0.1061, + "step": 241 + }, + { + "epoch": 2.1228070175438596, + "grad_norm": 1.0750821828842163, + "learning_rate": 4.305393164421301e-06, + 
"loss": 0.1082, + "step": 242 + }, + { + "epoch": 2.1315789473684212, + "grad_norm": 0.6171414852142334, + "learning_rate": 4.296064302287507e-06, + "loss": 0.1039, + "step": 243 + }, + { + "epoch": 2.1403508771929824, + "grad_norm": 0.8080905079841614, + "learning_rate": 4.286683465107403e-06, + "loss": 0.1069, + "step": 244 + }, + { + "epoch": 2.1491228070175437, + "grad_norm": 0.5281466245651245, + "learning_rate": 4.277250924346461e-06, + "loss": 0.1069, + "step": 245 + }, + { + "epoch": 2.1578947368421053, + "grad_norm": 0.8070254325866699, + "learning_rate": 4.267766952966369e-06, + "loss": 0.1061, + "step": 246 + }, + { + "epoch": 2.1666666666666665, + "grad_norm": 0.8560577630996704, + "learning_rate": 4.25823182541713e-06, + "loss": 0.1116, + "step": 247 + }, + { + "epoch": 2.175438596491228, + "grad_norm": 0.7772330045700073, + "learning_rate": 4.2486458176291176e-06, + "loss": 0.1092, + "step": 248 + }, + { + "epoch": 2.1842105263157894, + "grad_norm": 0.814601719379425, + "learning_rate": 4.239009207005096e-06, + "loss": 0.1093, + "step": 249 + }, + { + "epoch": 2.192982456140351, + "grad_norm": 0.957789957523346, + "learning_rate": 4.2293222724121855e-06, + "loss": 0.1075, + "step": 250 + }, + { + "epoch": 2.2017543859649122, + "grad_norm": 0.500062108039856, + "learning_rate": 4.219585294173799e-06, + "loss": 0.1048, + "step": 251 + }, + { + "epoch": 2.2105263157894735, + "grad_norm": 0.3866419792175293, + "learning_rate": 4.209798554061527e-06, + "loss": 0.1074, + "step": 252 + }, + { + "epoch": 2.219298245614035, + "grad_norm": 1.1853291988372803, + "learning_rate": 4.199962335286985e-06, + "loss": 0.1076, + "step": 253 + }, + { + "epoch": 2.2280701754385963, + "grad_norm": 0.36602887511253357, + "learning_rate": 4.1900769224936125e-06, + "loss": 0.108, + "step": 254 + }, + { + "epoch": 2.236842105263158, + "grad_norm": 0.2530711889266968, + "learning_rate": 4.180142601748447e-06, + "loss": 0.1041, + "step": 255 + }, + { + "epoch": 
2.245614035087719, + "grad_norm": 1.3067054748535156, + "learning_rate": 4.170159660533834e-06, + "loss": 0.1087, + "step": 256 + }, + { + "epoch": 2.254385964912281, + "grad_norm": 0.3442043960094452, + "learning_rate": 4.160128387739114e-06, + "loss": 0.1099, + "step": 257 + }, + { + "epoch": 2.263157894736842, + "grad_norm": 1.174796462059021, + "learning_rate": 4.150049073652262e-06, + "loss": 0.1063, + "step": 258 + }, + { + "epoch": 2.2719298245614037, + "grad_norm": 0.5719411969184875, + "learning_rate": 4.1399220099514845e-06, + "loss": 0.1043, + "step": 259 + }, + { + "epoch": 2.280701754385965, + "grad_norm": 0.7268956303596497, + "learning_rate": 4.129747489696781e-06, + "loss": 0.1038, + "step": 260 + }, + { + "epoch": 2.2894736842105265, + "grad_norm": 0.7028316259384155, + "learning_rate": 4.119525807321467e-06, + "loss": 0.1052, + "step": 261 + }, + { + "epoch": 2.2982456140350878, + "grad_norm": 1.015335202217102, + "learning_rate": 4.109257258623644e-06, + "loss": 0.1116, + "step": 262 + }, + { + "epoch": 2.307017543859649, + "grad_norm": 0.7141755819320679, + "learning_rate": 4.098942140757646e-06, + "loss": 0.108, + "step": 263 + }, + { + "epoch": 2.3157894736842106, + "grad_norm": 0.7656403183937073, + "learning_rate": 4.0885807522254435e-06, + "loss": 0.1043, + "step": 264 + }, + { + "epoch": 2.324561403508772, + "grad_norm": 0.43293774127960205, + "learning_rate": 4.078173392867998e-06, + "loss": 0.1048, + "step": 265 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 0.6755763292312622, + "learning_rate": 4.0677203638565895e-06, + "loss": 0.1064, + "step": 266 + }, + { + "epoch": 2.3421052631578947, + "grad_norm": 0.9648827314376831, + "learning_rate": 4.0572219676841e-06, + "loss": 0.1088, + "step": 267 + }, + { + "epoch": 2.3508771929824563, + "grad_norm": 0.32724836468696594, + "learning_rate": 4.046678508156259e-06, + "loss": 0.1077, + "step": 268 + }, + { + "epoch": 2.3596491228070176, + "grad_norm": 0.4696657061576843, + 
"learning_rate": 4.036090290382855e-06, + "loss": 0.1067, + "step": 269 + }, + { + "epoch": 2.3684210526315788, + "grad_norm": 0.33901306986808777, + "learning_rate": 4.025457620768901e-06, + "loss": 0.105, + "step": 270 + }, + { + "epoch": 2.3771929824561404, + "grad_norm": 0.5703794360160828, + "learning_rate": 4.014780807005775e-06, + "loss": 0.1033, + "step": 271 + }, + { + "epoch": 2.3859649122807016, + "grad_norm": 0.9639355540275574, + "learning_rate": 4.004060158062306e-06, + "loss": 0.1041, + "step": 272 + }, + { + "epoch": 2.3947368421052633, + "grad_norm": 0.8851558566093445, + "learning_rate": 3.993295984175845e-06, + "loss": 0.1064, + "step": 273 + }, + { + "epoch": 2.4035087719298245, + "grad_norm": 0.5200062990188599, + "learning_rate": 3.982488596843276e-06, + "loss": 0.1056, + "step": 274 + }, + { + "epoch": 2.412280701754386, + "grad_norm": 1.160823106765747, + "learning_rate": 3.971638308812007e-06, + "loss": 0.1069, + "step": 275 + }, + { + "epoch": 2.4210526315789473, + "grad_norm": 1.0191210508346558, + "learning_rate": 3.9607454340709215e-06, + "loss": 0.1042, + "step": 276 + }, + { + "epoch": 2.4298245614035086, + "grad_norm": 0.37181487679481506, + "learning_rate": 3.949810287841289e-06, + "loss": 0.1062, + "step": 277 + }, + { + "epoch": 2.43859649122807, + "grad_norm": 0.9328593611717224, + "learning_rate": 3.9388331865676436e-06, + "loss": 0.1086, + "step": 278 + }, + { + "epoch": 2.4473684210526314, + "grad_norm": 0.8024734258651733, + "learning_rate": 3.927814447908625e-06, + "loss": 0.1051, + "step": 279 + }, + { + "epoch": 2.456140350877193, + "grad_norm": 0.9746696352958679, + "learning_rate": 3.916754390727795e-06, + "loss": 0.1041, + "step": 280 + }, + { + "epoch": 2.4649122807017543, + "grad_norm": 0.5457844138145447, + "learning_rate": 3.905653335084394e-06, + "loss": 0.1052, + "step": 281 + }, + { + "epoch": 2.473684210526316, + "grad_norm": 1.0736924409866333, + "learning_rate": 3.8945116022240945e-06, + "loss": 0.1075, + 
"step": 282 + }, + { + "epoch": 2.482456140350877, + "grad_norm": 0.6335628032684326, + "learning_rate": 3.8833295145696964e-06, + "loss": 0.1036, + "step": 283 + }, + { + "epoch": 2.4912280701754383, + "grad_norm": 0.6909618377685547, + "learning_rate": 3.872107395711799e-06, + "loss": 0.1089, + "step": 284 + }, + { + "epoch": 2.5, + "grad_norm": 2.1871702671051025, + "learning_rate": 3.860845570399435e-06, + "loss": 0.1066, + "step": 285 + }, + { + "epoch": 2.5087719298245617, + "grad_norm": 0.5831722617149353, + "learning_rate": 3.849544364530678e-06, + "loss": 0.1055, + "step": 286 + }, + { + "epoch": 2.517543859649123, + "grad_norm": 0.5302637815475464, + "learning_rate": 3.838204105143204e-06, + "loss": 0.1057, + "step": 287 + }, + { + "epoch": 2.526315789473684, + "grad_norm": 0.6348035931587219, + "learning_rate": 3.8268251204048335e-06, + "loss": 0.1089, + "step": 288 + }, + { + "epoch": 2.5350877192982457, + "grad_norm": 2.1932008266448975, + "learning_rate": 3.815407739604033e-06, + "loss": 0.1043, + "step": 289 + }, + { + "epoch": 2.543859649122807, + "grad_norm": 0.4388940930366516, + "learning_rate": 3.803952293140385e-06, + "loss": 0.1055, + "step": 290 + }, + { + "epoch": 2.5526315789473686, + "grad_norm": 0.6853339076042175, + "learning_rate": 3.7924591125150265e-06, + "loss": 0.1036, + "step": 291 + }, + { + "epoch": 2.56140350877193, + "grad_norm": 0.34744876623153687, + "learning_rate": 3.78092853032106e-06, + "loss": 0.1025, + "step": 292 + }, + { + "epoch": 2.5701754385964914, + "grad_norm": 0.9523847699165344, + "learning_rate": 3.769360880233922e-06, + "loss": 0.1067, + "step": 293 + }, + { + "epoch": 2.5789473684210527, + "grad_norm": 1.303745985031128, + "learning_rate": 3.7577564970017338e-06, + "loss": 0.1082, + "step": 294 + }, + { + "epoch": 2.587719298245614, + "grad_norm": 0.9468981623649597, + "learning_rate": 3.7461157164356103e-06, + "loss": 0.1055, + "step": 295 + }, + { + "epoch": 2.5964912280701755, + "grad_norm": 
0.7204175591468811, + "learning_rate": 3.7344388753999434e-06, + "loss": 0.1055, + "step": 296 + }, + { + "epoch": 2.6052631578947367, + "grad_norm": 0.5110165476799011, + "learning_rate": 3.7227263118026537e-06, + "loss": 0.1092, + "step": 297 + }, + { + "epoch": 2.6140350877192984, + "grad_norm": 0.6483246088027954, + "learning_rate": 3.7109783645854116e-06, + "loss": 0.1078, + "step": 298 + }, + { + "epoch": 2.6228070175438596, + "grad_norm": 0.5058422684669495, + "learning_rate": 3.699195373713831e-06, + "loss": 0.1073, + "step": 299 + }, + { + "epoch": 2.6315789473684212, + "grad_norm": 0.4123518764972687, + "learning_rate": 3.6873776801676265e-06, + "loss": 0.1053, + "step": 300 + }, + { + "epoch": 2.6403508771929824, + "grad_norm": 1.0864709615707397, + "learning_rate": 3.675525625930751e-06, + "loss": 0.1048, + "step": 301 + }, + { + "epoch": 2.6491228070175437, + "grad_norm": 1.0264904499053955, + "learning_rate": 3.6636395539814975e-06, + "loss": 0.1059, + "step": 302 + }, + { + "epoch": 2.6578947368421053, + "grad_norm": 0.7724822163581848, + "learning_rate": 3.651719808282573e-06, + "loss": 0.1063, + "step": 303 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.7474755644798279, + "learning_rate": 3.6397667337711475e-06, + "loss": 0.1034, + "step": 304 + }, + { + "epoch": 2.675438596491228, + "grad_norm": 0.5628909468650818, + "learning_rate": 3.6277806763488666e-06, + "loss": 0.1026, + "step": 305 + }, + { + "epoch": 2.6842105263157894, + "grad_norm": 0.9070547819137573, + "learning_rate": 3.6157619828718477e-06, + "loss": 0.1031, + "step": 306 + }, + { + "epoch": 2.692982456140351, + "grad_norm": 0.6968091130256653, + "learning_rate": 3.603711001140641e-06, + "loss": 0.1068, + "step": 307 + }, + { + "epoch": 2.7017543859649122, + "grad_norm": 0.3764977753162384, + "learning_rate": 3.5916280798901604e-06, + "loss": 0.1038, + "step": 308 + }, + { + "epoch": 2.7105263157894735, + "grad_norm": 5.012625694274902, + "learning_rate": 
3.5795135687795984e-06, + "loss": 0.1129, + "step": 309 + }, + { + "epoch": 2.719298245614035, + "grad_norm": 0.6745572686195374, + "learning_rate": 3.567367818382303e-06, + "loss": 0.1071, + "step": 310 + }, + { + "epoch": 2.7280701754385968, + "grad_norm": 1.0659606456756592, + "learning_rate": 3.555191180175634e-06, + "loss": 0.1067, + "step": 311 + }, + { + "epoch": 2.736842105263158, + "grad_norm": 1.7312604188919067, + "learning_rate": 3.5429840065307924e-06, + "loss": 0.1101, + "step": 312 + }, + { + "epoch": 2.745614035087719, + "grad_norm": 1.100364327430725, + "learning_rate": 3.5307466507026223e-06, + "loss": 0.1098, + "step": 313 + }, + { + "epoch": 2.754385964912281, + "grad_norm": 1.0390428304672241, + "learning_rate": 3.5184794668193893e-06, + "loss": 0.1094, + "step": 314 + }, + { + "epoch": 2.763157894736842, + "grad_norm": 0.3369971811771393, + "learning_rate": 3.5061828098725327e-06, + "loss": 0.1053, + "step": 315 + }, + { + "epoch": 2.7719298245614032, + "grad_norm": 0.6130257248878479, + "learning_rate": 3.4938570357063906e-06, + "loss": 0.106, + "step": 316 + }, + { + "epoch": 2.780701754385965, + "grad_norm": 0.6387595534324646, + "learning_rate": 3.481502501007904e-06, + "loss": 0.1044, + "step": 317 + }, + { + "epoch": 2.7894736842105265, + "grad_norm": 1.0731587409973145, + "learning_rate": 3.469119563296296e-06, + "loss": 0.1097, + "step": 318 + }, + { + "epoch": 2.7982456140350878, + "grad_norm": 0.8096229434013367, + "learning_rate": 3.4567085809127247e-06, + "loss": 0.1076, + "step": 319 + }, + { + "epoch": 2.807017543859649, + "grad_norm": 0.5034844279289246, + "learning_rate": 3.444269913009912e-06, + "loss": 0.1071, + "step": 320 + }, + { + "epoch": 2.8157894736842106, + "grad_norm": 0.675139307975769, + "learning_rate": 3.4318039195417536e-06, + "loss": 0.1039, + "step": 321 + }, + { + "epoch": 2.824561403508772, + "grad_norm": 0.7330355644226074, + "learning_rate": 3.4193109612528972e-06, + "loss": 0.1044, + "step": 322 + }, + { 
+ "epoch": 2.8333333333333335, + "grad_norm": 0.6558271646499634, + "learning_rate": 3.4067913996683115e-06, + "loss": 0.1051, + "step": 323 + }, + { + "epoch": 2.8421052631578947, + "grad_norm": 0.8411844372749329, + "learning_rate": 3.3942455970828146e-06, + "loss": 0.1063, + "step": 324 + }, + { + "epoch": 2.8508771929824563, + "grad_norm": 0.4817325174808502, + "learning_rate": 3.3816739165505964e-06, + "loss": 0.105, + "step": 325 + }, + { + "epoch": 2.8596491228070176, + "grad_norm": 0.424554705619812, + "learning_rate": 3.3690767218747104e-06, + "loss": 0.1037, + "step": 326 + }, + { + "epoch": 2.8684210526315788, + "grad_norm": 1.0054417848587036, + "learning_rate": 3.3564543775965475e-06, + "loss": 0.1058, + "step": 327 + }, + { + "epoch": 2.8771929824561404, + "grad_norm": 0.8984584808349609, + "learning_rate": 3.3438072489852837e-06, + "loss": 0.1079, + "step": 328 + }, + { + "epoch": 2.8859649122807016, + "grad_norm": 0.6779558062553406, + "learning_rate": 3.331135702027311e-06, + "loss": 0.1046, + "step": 329 + }, + { + "epoch": 2.8947368421052633, + "grad_norm": 0.6931657195091248, + "learning_rate": 3.318440103415649e-06, + "loss": 0.1106, + "step": 330 + }, + { + "epoch": 2.9035087719298245, + "grad_norm": 0.705264151096344, + "learning_rate": 3.305720820539329e-06, + "loss": 0.104, + "step": 331 + }, + { + "epoch": 2.912280701754386, + "grad_norm": 0.7799407839775085, + "learning_rate": 3.2929782214727657e-06, + "loss": 0.1019, + "step": 332 + }, + { + "epoch": 2.9210526315789473, + "grad_norm": 0.7583760619163513, + "learning_rate": 3.2802126749651042e-06, + "loss": 0.1049, + "step": 333 + }, + { + "epoch": 2.9298245614035086, + "grad_norm": 0.6145837306976318, + "learning_rate": 3.2674245504295505e-06, + "loss": 0.104, + "step": 334 + }, + { + "epoch": 2.93859649122807, + "grad_norm": 0.5170779228210449, + "learning_rate": 3.254614217932679e-06, + "loss": 0.1024, + "step": 335 + }, + { + "epoch": 2.9473684210526314, + "grad_norm": 
0.6850940585136414, + "learning_rate": 3.241782048183726e-06, + "loss": 0.1047, + "step": 336 + }, + { + "epoch": 2.956140350877193, + "grad_norm": 0.7307694554328918, + "learning_rate": 3.2289284125238597e-06, + "loss": 0.1032, + "step": 337 + }, + { + "epoch": 2.9649122807017543, + "grad_norm": 0.3386179208755493, + "learning_rate": 3.216053682915436e-06, + "loss": 0.1037, + "step": 338 + }, + { + "epoch": 2.973684210526316, + "grad_norm": 0.7565059065818787, + "learning_rate": 3.203158231931234e-06, + "loss": 0.1048, + "step": 339 + }, + { + "epoch": 2.982456140350877, + "grad_norm": 0.7902039289474487, + "learning_rate": 3.190242432743673e-06, + "loss": 0.1068, + "step": 340 + }, + { + "epoch": 2.9912280701754383, + "grad_norm": 0.42595192790031433, + "learning_rate": 3.177306659114015e-06, + "loss": 0.1039, + "step": 341 + }, + { + "epoch": 3.0, + "grad_norm": 1.1214542388916016, + "learning_rate": 3.164351285381549e-06, + "loss": 0.1062, + "step": 342 + }, + { + "epoch": 3.008771929824561, + "grad_norm": 0.7622955441474915, + "learning_rate": 3.1513766864527577e-06, + "loss": 0.1015, + "step": 343 + }, + { + "epoch": 3.017543859649123, + "grad_norm": 0.2676297724246979, + "learning_rate": 3.1383832377904676e-06, + "loss": 0.1037, + "step": 344 + }, + { + "epoch": 3.026315789473684, + "grad_norm": 0.8695605397224426, + "learning_rate": 3.1253713154029857e-06, + "loss": 0.1056, + "step": 345 + }, + { + "epoch": 3.0350877192982457, + "grad_norm": 0.5875906944274902, + "learning_rate": 3.1123412958332155e-06, + "loss": 0.1067, + "step": 346 + }, + { + "epoch": 3.043859649122807, + "grad_norm": 0.7699372172355652, + "learning_rate": 3.0992935561477632e-06, + "loss": 0.1035, + "step": 347 + }, + { + "epoch": 3.0526315789473686, + "grad_norm": 0.5919204354286194, + "learning_rate": 3.0862284739260247e-06, + "loss": 0.1023, + "step": 348 + }, + { + "epoch": 3.06140350877193, + "grad_norm": 1.3211849927902222, + "learning_rate": 3.07314642724926e-06, + "loss": 0.1065, 
+ "step": 349 + }, + { + "epoch": 3.0701754385964914, + "grad_norm": 0.6359637379646301, + "learning_rate": 3.0600477946896494e-06, + "loss": 0.106, + "step": 350 + }, + { + "epoch": 3.0789473684210527, + "grad_norm": 0.35776662826538086, + "learning_rate": 3.046932955299344e-06, + "loss": 0.1046, + "step": 351 + }, + { + "epoch": 3.087719298245614, + "grad_norm": 0.6657406687736511, + "learning_rate": 3.0338022885994904e-06, + "loss": 0.1076, + "step": 352 + }, + { + "epoch": 3.0964912280701755, + "grad_norm": 0.7587785720825195, + "learning_rate": 3.0206561745692512e-06, + "loss": 0.1043, + "step": 353 + }, + { + "epoch": 3.1052631578947367, + "grad_norm": 1.1258317232131958, + "learning_rate": 3.0074949936348084e-06, + "loss": 0.1043, + "step": 354 + }, + { + "epoch": 3.1140350877192984, + "grad_norm": 0.3570568263530731, + "learning_rate": 2.9943191266583564e-06, + "loss": 0.1032, + "step": 355 + }, + { + "epoch": 3.1228070175438596, + "grad_norm": 0.843485414981842, + "learning_rate": 2.981128954927075e-06, + "loss": 0.1045, + "step": 356 + }, + { + "epoch": 3.1315789473684212, + "grad_norm": 0.5719651579856873, + "learning_rate": 2.967924860142103e-06, + "loss": 0.1052, + "step": 357 + }, + { + "epoch": 3.1403508771929824, + "grad_norm": 2.20767879486084, + "learning_rate": 2.9547072244074853e-06, + "loss": 0.1078, + "step": 358 + }, + { + "epoch": 3.1491228070175437, + "grad_norm": 0.3715457022190094, + "learning_rate": 2.941476430219122e-06, + "loss": 0.1047, + "step": 359 + }, + { + "epoch": 3.1578947368421053, + "grad_norm": 0.7803200483322144, + "learning_rate": 2.928232860453694e-06, + "loss": 0.1029, + "step": 360 + }, + { + "epoch": 3.1666666666666665, + "grad_norm": 0.5198164582252502, + "learning_rate": 2.9149768983575884e-06, + "loss": 0.1032, + "step": 361 + }, + { + "epoch": 3.175438596491228, + "grad_norm": 0.7827185988426208, + "learning_rate": 2.9017089275358017e-06, + "loss": 0.1043, + "step": 362 + }, + { + "epoch": 3.1842105263157894, + 
"grad_norm": 0.4000351130962372, + "learning_rate": 2.8884293319408464e-06, + "loss": 0.1071, + "step": 363 + }, + { + "epoch": 3.192982456140351, + "grad_norm": 0.9913386106491089, + "learning_rate": 2.8751384958616318e-06, + "loss": 0.1022, + "step": 364 + }, + { + "epoch": 3.2017543859649122, + "grad_norm": 0.6975695490837097, + "learning_rate": 2.861836803912353e-06, + "loss": 0.1029, + "step": 365 + }, + { + "epoch": 3.2105263157894735, + "grad_norm": 0.2372695654630661, + "learning_rate": 2.8485246410213497e-06, + "loss": 0.1015, + "step": 366 + }, + { + "epoch": 3.219298245614035, + "grad_norm": 0.447732537984848, + "learning_rate": 2.835202392419977e-06, + "loss": 0.1052, + "step": 367 + }, + { + "epoch": 3.2280701754385963, + "grad_norm": 0.6617346405982971, + "learning_rate": 2.8218704436314525e-06, + "loss": 0.1055, + "step": 368 + }, + { + "epoch": 3.236842105263158, + "grad_norm": 0.5550402402877808, + "learning_rate": 2.8085291804596995e-06, + "loss": 0.102, + "step": 369 + }, + { + "epoch": 3.245614035087719, + "grad_norm": 0.6046020984649658, + "learning_rate": 2.795178988978185e-06, + "loss": 0.1036, + "step": 370 + }, + { + "epoch": 3.254385964912281, + "grad_norm": 0.41890618205070496, + "learning_rate": 2.781820255518745e-06, + "loss": 0.1036, + "step": 371 + }, + { + "epoch": 3.263157894736842, + "grad_norm": 0.8387415409088135, + "learning_rate": 2.768453366660408e-06, + "loss": 0.1076, + "step": 372 + }, + { + "epoch": 3.2719298245614037, + "grad_norm": 0.5318773984909058, + "learning_rate": 2.755078709218203e-06, + "loss": 0.1052, + "step": 373 + }, + { + "epoch": 3.280701754385965, + "grad_norm": 0.6617523431777954, + "learning_rate": 2.741696670231969e-06, + "loss": 0.1049, + "step": 374 + }, + { + "epoch": 3.2894736842105265, + "grad_norm": 1.0190025568008423, + "learning_rate": 2.728307636955156e-06, + "loss": 0.1034, + "step": 375 + }, + { + "epoch": 3.2982456140350878, + "grad_norm": 0.6924716234207153, + "learning_rate": 
2.714911996843617e-06, + "loss": 0.1065, + "step": 376 + }, + { + "epoch": 3.307017543859649, + "grad_norm": 0.42501118779182434, + "learning_rate": 2.701510137544393e-06, + "loss": 0.1019, + "step": 377 + }, + { + "epoch": 3.3157894736842106, + "grad_norm": 0.844886064529419, + "learning_rate": 2.6881024468845e-06, + "loss": 0.1047, + "step": 378 + }, + { + "epoch": 3.324561403508772, + "grad_norm": 0.46512728929519653, + "learning_rate": 2.674689312859704e-06, + "loss": 0.1043, + "step": 379 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 0.6242017149925232, + "learning_rate": 2.6612711236232915e-06, + "loss": 0.1046, + "step": 380 + }, + { + "epoch": 3.3421052631578947, + "grad_norm": 0.6578526496887207, + "learning_rate": 2.6478482674748375e-06, + "loss": 0.1031, + "step": 381 + }, + { + "epoch": 3.3508771929824563, + "grad_norm": 0.4822542667388916, + "learning_rate": 2.63442113284897e-06, + "loss": 0.1053, + "step": 382 + }, + { + "epoch": 3.3596491228070176, + "grad_norm": 0.48255595564842224, + "learning_rate": 2.6209901083041307e-06, + "loss": 0.1058, + "step": 383 + }, + { + "epoch": 3.3684210526315788, + "grad_norm": 0.6624025702476501, + "learning_rate": 2.6075555825113265e-06, + "loss": 0.1066, + "step": 384 + }, + { + "epoch": 3.3771929824561404, + "grad_norm": 0.6962618827819824, + "learning_rate": 2.5941179442428864e-06, + "loss": 0.102, + "step": 385 + }, + { + "epoch": 3.3859649122807016, + "grad_norm": 0.4976450502872467, + "learning_rate": 2.580677582361208e-06, + "loss": 0.1011, + "step": 386 + }, + { + "epoch": 3.3947368421052633, + "grad_norm": 0.5283737182617188, + "learning_rate": 2.5672348858075053e-06, + "loss": 0.1057, + "step": 387 + }, + { + "epoch": 3.4035087719298245, + "grad_norm": 0.32338738441467285, + "learning_rate": 2.553790243590556e-06, + "loss": 0.1015, + "step": 388 + }, + { + "epoch": 3.412280701754386, + "grad_norm": 0.7909435629844666, + "learning_rate": 2.5403440447754385e-06, + "loss": 0.1036, + "step": 389 + }, 
+ { + "epoch": 3.4210526315789473, + "grad_norm": 0.6297115087509155, + "learning_rate": 2.5268966784722792e-06, + "loss": 0.1042, + "step": 390 + }, + { + "epoch": 3.4298245614035086, + "grad_norm": 0.32988762855529785, + "learning_rate": 2.513448533824988e-06, + "loss": 0.1059, + "step": 391 + }, + { + "epoch": 3.43859649122807, + "grad_norm": 0.9211220145225525, + "learning_rate": 2.5e-06, + "loss": 0.1015, + "step": 392 + }, + { + "epoch": 3.4473684210526314, + "grad_norm": 1.2157588005065918, + "learning_rate": 2.486551466175013e-06, + "loss": 0.1035, + "step": 393 + }, + { + "epoch": 3.456140350877193, + "grad_norm": 0.4786648452281952, + "learning_rate": 2.4731033215277216e-06, + "loss": 0.1026, + "step": 394 + }, + { + "epoch": 3.4649122807017543, + "grad_norm": 0.37398242950439453, + "learning_rate": 2.4596559552245623e-06, + "loss": 0.1044, + "step": 395 + }, + { + "epoch": 3.473684210526316, + "grad_norm": 0.5536217093467712, + "learning_rate": 2.446209756409445e-06, + "loss": 0.1043, + "step": 396 + }, + { + "epoch": 3.482456140350877, + "grad_norm": 0.708406925201416, + "learning_rate": 2.432765114192495e-06, + "loss": 0.1046, + "step": 397 + }, + { + "epoch": 3.4912280701754383, + "grad_norm": 0.7140893340110779, + "learning_rate": 2.4193224176387926e-06, + "loss": 0.1039, + "step": 398 + }, + { + "epoch": 3.5, + "grad_norm": 0.8078088760375977, + "learning_rate": 2.4058820557571144e-06, + "loss": 0.1013, + "step": 399 + }, + { + "epoch": 3.5087719298245617, + "grad_norm": 0.7129591107368469, + "learning_rate": 2.3924444174886735e-06, + "loss": 0.1057, + "step": 400 + }, + { + "epoch": 3.517543859649123, + "grad_norm": 1.293412446975708, + "learning_rate": 2.37900989169587e-06, + "loss": 0.1081, + "step": 401 + }, + { + "epoch": 3.526315789473684, + "grad_norm": 0.7235314249992371, + "learning_rate": 2.3655788671510314e-06, + "loss": 0.1054, + "step": 402 + }, + { + "epoch": 3.5350877192982457, + "grad_norm": 0.6008841395378113, + "learning_rate": 
2.3521517325251637e-06, + "loss": 0.1033, + "step": 403 + }, + { + "epoch": 3.543859649122807, + "grad_norm": 0.6819609999656677, + "learning_rate": 2.3387288763767097e-06, + "loss": 0.1019, + "step": 404 + }, + { + "epoch": 3.5526315789473686, + "grad_norm": 0.5696406960487366, + "learning_rate": 2.325310687140296e-06, + "loss": 0.1043, + "step": 405 + }, + { + "epoch": 3.56140350877193, + "grad_norm": 0.8597077131271362, + "learning_rate": 2.3118975531155003e-06, + "loss": 0.1037, + "step": 406 + }, + { + "epoch": 3.5701754385964914, + "grad_norm": 0.43985217809677124, + "learning_rate": 2.2984898624556075e-06, + "loss": 0.105, + "step": 407 + }, + { + "epoch": 3.5789473684210527, + "grad_norm": 0.5448469519615173, + "learning_rate": 2.2850880031563845e-06, + "loss": 0.1037, + "step": 408 + }, + { + "epoch": 3.587719298245614, + "grad_norm": 0.8221977949142456, + "learning_rate": 2.271692363044845e-06, + "loss": 0.1015, + "step": 409 + }, + { + "epoch": 3.5964912280701755, + "grad_norm": 0.9838594198226929, + "learning_rate": 2.2583033297680316e-06, + "loss": 0.1085, + "step": 410 + }, + { + "epoch": 3.6052631578947367, + "grad_norm": 1.034848928451538, + "learning_rate": 2.2449212907817985e-06, + "loss": 0.104, + "step": 411 + }, + { + "epoch": 3.6140350877192984, + "grad_norm": 1.0788371562957764, + "learning_rate": 2.2315466333395927e-06, + "loss": 0.1033, + "step": 412 + }, + { + "epoch": 3.6228070175438596, + "grad_norm": 0.49096915125846863, + "learning_rate": 2.2181797444812557e-06, + "loss": 0.1044, + "step": 413 + }, + { + "epoch": 3.6315789473684212, + "grad_norm": 1.309685230255127, + "learning_rate": 2.204821011021815e-06, + "loss": 0.1036, + "step": 414 + }, + { + "epoch": 3.6403508771929824, + "grad_norm": 0.5014146566390991, + "learning_rate": 2.191470819540301e-06, + "loss": 0.104, + "step": 415 + }, + { + "epoch": 3.6491228070175437, + "grad_norm": 0.770470380783081, + "learning_rate": 2.178129556368548e-06, + "loss": 0.1049, + "step": 416 + }, + 
{ + "epoch": 3.6578947368421053, + "grad_norm": 0.4639376699924469, + "learning_rate": 2.1647976075800235e-06, + "loss": 0.1047, + "step": 417 + }, + { + "epoch": 3.6666666666666665, + "grad_norm": 1.101885437965393, + "learning_rate": 2.151475358978652e-06, + "loss": 0.1035, + "step": 418 + }, + { + "epoch": 3.675438596491228, + "grad_norm": 0.5644329786300659, + "learning_rate": 2.138163196087648e-06, + "loss": 0.103, + "step": 419 + }, + { + "epoch": 3.6842105263157894, + "grad_norm": 1.1015008687973022, + "learning_rate": 2.1248615041383686e-06, + "loss": 0.1054, + "step": 420 + }, + { + "epoch": 3.692982456140351, + "grad_norm": 0.7311366200447083, + "learning_rate": 2.111570668059155e-06, + "loss": 0.1043, + "step": 421 + }, + { + "epoch": 3.7017543859649122, + "grad_norm": 0.38242173194885254, + "learning_rate": 2.098291072464199e-06, + "loss": 0.1041, + "step": 422 + }, + { + "epoch": 3.7105263157894735, + "grad_norm": 1.231512188911438, + "learning_rate": 2.085023101642412e-06, + "loss": 0.1021, + "step": 423 + }, + { + "epoch": 3.719298245614035, + "grad_norm": 0.41761213541030884, + "learning_rate": 2.0717671395463063e-06, + "loss": 0.1062, + "step": 424 + }, + { + "epoch": 3.7280701754385968, + "grad_norm": 0.4593309462070465, + "learning_rate": 2.0585235697808794e-06, + "loss": 0.1012, + "step": 425 + }, + { + "epoch": 3.736842105263158, + "grad_norm": 0.9147135019302368, + "learning_rate": 2.0452927755925155e-06, + "loss": 0.1046, + "step": 426 + }, + { + "epoch": 3.745614035087719, + "grad_norm": 0.39639535546302795, + "learning_rate": 2.0320751398578984e-06, + "loss": 0.1018, + "step": 427 + }, + { + "epoch": 3.754385964912281, + "grad_norm": 0.688010573387146, + "learning_rate": 2.0188710450729255e-06, + "loss": 0.104, + "step": 428 + }, + { + "epoch": 3.763157894736842, + "grad_norm": 0.5140353441238403, + "learning_rate": 2.005680873341644e-06, + "loss": 0.1033, + "step": 429 + }, + { + "epoch": 3.7719298245614032, + "grad_norm": 
0.5970481634140015, + "learning_rate": 1.992505006365191e-06, + "loss": 0.1044, + "step": 430 + }, + { + "epoch": 3.780701754385965, + "grad_norm": 0.551162838935852, + "learning_rate": 1.9793438254307496e-06, + "loss": 0.1042, + "step": 431 + }, + { + "epoch": 3.7894736842105265, + "grad_norm": 0.5344637632369995, + "learning_rate": 1.96619771140051e-06, + "loss": 0.1042, + "step": 432 + }, + { + "epoch": 3.7982456140350878, + "grad_norm": 0.5357667207717896, + "learning_rate": 1.9530670447006566e-06, + "loss": 0.101, + "step": 433 + }, + { + "epoch": 3.807017543859649, + "grad_norm": 1.2536660432815552, + "learning_rate": 1.9399522053103514e-06, + "loss": 0.1008, + "step": 434 + }, + { + "epoch": 3.8157894736842106, + "grad_norm": 0.4888289272785187, + "learning_rate": 1.926853572750741e-06, + "loss": 0.1028, + "step": 435 + }, + { + "epoch": 3.824561403508772, + "grad_norm": 0.5810404419898987, + "learning_rate": 1.913771526073976e-06, + "loss": 0.1031, + "step": 436 + }, + { + "epoch": 3.8333333333333335, + "grad_norm": 0.5372979044914246, + "learning_rate": 1.9007064438522374e-06, + "loss": 0.107, + "step": 437 + }, + { + "epoch": 3.8421052631578947, + "grad_norm": 0.8293616771697998, + "learning_rate": 1.8876587041667855e-06, + "loss": 0.1033, + "step": 438 + }, + { + "epoch": 3.8508771929824563, + "grad_norm": 2.361504554748535, + "learning_rate": 1.8746286845970145e-06, + "loss": 0.1098, + "step": 439 + }, + { + "epoch": 3.8596491228070176, + "grad_norm": 0.70230633020401, + "learning_rate": 1.8616167622095328e-06, + "loss": 0.1034, + "step": 440 + }, + { + "epoch": 3.8684210526315788, + "grad_norm": 0.6323564052581787, + "learning_rate": 1.8486233135472436e-06, + "loss": 0.1058, + "step": 441 + }, + { + "epoch": 3.8771929824561404, + "grad_norm": 0.48205408453941345, + "learning_rate": 1.8356487146184517e-06, + "loss": 0.105, + "step": 442 + }, + { + "epoch": 3.8859649122807016, + "grad_norm": 0.6996872425079346, + "learning_rate": 1.8226933408859864e-06, 
+ "loss": 0.1083, + "step": 443 + }, + { + "epoch": 3.8947368421052633, + "grad_norm": 0.4114651679992676, + "learning_rate": 1.8097575672563278e-06, + "loss": 0.1003, + "step": 444 + }, + { + "epoch": 3.9035087719298245, + "grad_norm": 0.5234648585319519, + "learning_rate": 1.7968417680687666e-06, + "loss": 0.1019, + "step": 445 + }, + { + "epoch": 3.912280701754386, + "grad_norm": 1.0571491718292236, + "learning_rate": 1.7839463170845641e-06, + "loss": 0.1003, + "step": 446 + }, + { + "epoch": 3.9210526315789473, + "grad_norm": 0.7470094561576843, + "learning_rate": 1.7710715874761408e-06, + "loss": 0.1061, + "step": 447 + }, + { + "epoch": 3.9298245614035086, + "grad_norm": 0.901695191860199, + "learning_rate": 1.7582179518162742e-06, + "loss": 0.1015, + "step": 448 + }, + { + "epoch": 3.93859649122807, + "grad_norm": 1.0251179933547974, + "learning_rate": 1.7453857820673215e-06, + "loss": 0.1, + "step": 449 + }, + { + "epoch": 3.9473684210526314, + "grad_norm": 0.5065406560897827, + "learning_rate": 1.7325754495704508e-06, + "loss": 0.1036, + "step": 450 + }, + { + "epoch": 3.956140350877193, + "grad_norm": 0.9541155099868774, + "learning_rate": 1.7197873250348962e-06, + "loss": 0.1015, + "step": 451 + }, + { + "epoch": 3.9649122807017543, + "grad_norm": 0.6264199018478394, + "learning_rate": 1.7070217785272354e-06, + "loss": 0.1026, + "step": 452 + }, + { + "epoch": 3.973684210526316, + "grad_norm": 0.6260526180267334, + "learning_rate": 1.6942791794606716e-06, + "loss": 0.1039, + "step": 453 + }, + { + "epoch": 3.982456140350877, + "grad_norm": 0.4730931222438812, + "learning_rate": 1.681559896584352e-06, + "loss": 0.1045, + "step": 454 + }, + { + "epoch": 3.9912280701754383, + "grad_norm": 0.5011451840400696, + "learning_rate": 1.668864297972689e-06, + "loss": 0.1062, + "step": 455 + }, + { + "epoch": 4.0, + "grad_norm": 1.0113046169281006, + "learning_rate": 1.6561927510147172e-06, + "loss": 0.1005, + "step": 456 + }, + { + "epoch": 4.008771929824562, + 
"grad_norm": 0.6017364263534546, + "learning_rate": 1.6435456224034536e-06, + "loss": 0.1042, + "step": 457 + }, + { + "epoch": 4.017543859649122, + "grad_norm": 0.6874931454658508, + "learning_rate": 1.63092327812529e-06, + "loss": 0.102, + "step": 458 + }, + { + "epoch": 4.026315789473684, + "grad_norm": 1.311024785041809, + "learning_rate": 1.6183260834494053e-06, + "loss": 0.1063, + "step": 459 + }, + { + "epoch": 4.035087719298246, + "grad_norm": 0.3640352785587311, + "learning_rate": 1.6057544029171863e-06, + "loss": 0.1039, + "step": 460 + }, + { + "epoch": 4.043859649122807, + "grad_norm": 0.6056526303291321, + "learning_rate": 1.5932086003316893e-06, + "loss": 0.099, + "step": 461 + }, + { + "epoch": 4.052631578947368, + "grad_norm": 0.5407683849334717, + "learning_rate": 1.5806890387471025e-06, + "loss": 0.1038, + "step": 462 + }, + { + "epoch": 4.06140350877193, + "grad_norm": 0.7054030895233154, + "learning_rate": 1.5681960804582474e-06, + "loss": 0.1001, + "step": 463 + }, + { + "epoch": 4.0701754385964914, + "grad_norm": 0.8736140727996826, + "learning_rate": 1.5557300869900876e-06, + "loss": 0.1035, + "step": 464 + }, + { + "epoch": 4.078947368421052, + "grad_norm": 0.6689419746398926, + "learning_rate": 1.5432914190872757e-06, + "loss": 0.1052, + "step": 465 + }, + { + "epoch": 4.087719298245614, + "grad_norm": 0.8937819600105286, + "learning_rate": 1.530880436703705e-06, + "loss": 0.1024, + "step": 466 + }, + { + "epoch": 4.0964912280701755, + "grad_norm": 0.24332484602928162, + "learning_rate": 1.518497498992097e-06, + "loss": 0.0984, + "step": 467 + }, + { + "epoch": 4.105263157894737, + "grad_norm": 0.9716914296150208, + "learning_rate": 1.5061429642936107e-06, + "loss": 0.1012, + "step": 468 + }, + { + "epoch": 4.114035087719298, + "grad_norm": 0.5864392518997192, + "learning_rate": 1.4938171901274678e-06, + "loss": 0.1029, + "step": 469 + }, + { + "epoch": 4.12280701754386, + "grad_norm": 0.4616212546825409, + "learning_rate": 
1.4815205331806113e-06, + "loss": 0.1035, + "step": 470 + }, + { + "epoch": 4.131578947368421, + "grad_norm": 0.5989730954170227, + "learning_rate": 1.4692533492973775e-06, + "loss": 0.1036, + "step": 471 + }, + { + "epoch": 4.140350877192983, + "grad_norm": 0.7900629639625549, + "learning_rate": 1.4570159934692085e-06, + "loss": 0.1044, + "step": 472 + }, + { + "epoch": 4.149122807017544, + "grad_norm": 0.5659995675086975, + "learning_rate": 1.4448088198243668e-06, + "loss": 0.1024, + "step": 473 + }, + { + "epoch": 4.157894736842105, + "grad_norm": 0.7867873311042786, + "learning_rate": 1.432632181617698e-06, + "loss": 0.1038, + "step": 474 + }, + { + "epoch": 4.166666666666667, + "grad_norm": 0.44385358691215515, + "learning_rate": 1.4204864312204033e-06, + "loss": 0.1006, + "step": 475 + }, + { + "epoch": 4.175438596491228, + "grad_norm": 0.3909265697002411, + "learning_rate": 1.4083719201098404e-06, + "loss": 0.1019, + "step": 476 + }, + { + "epoch": 4.184210526315789, + "grad_norm": 0.7079223990440369, + "learning_rate": 1.3962889988593609e-06, + "loss": 0.1019, + "step": 477 + }, + { + "epoch": 4.192982456140351, + "grad_norm": 0.6703695058822632, + "learning_rate": 1.3842380171281522e-06, + "loss": 0.1063, + "step": 478 + }, + { + "epoch": 4.201754385964913, + "grad_norm": 0.3477051556110382, + "learning_rate": 1.3722193236511344e-06, + "loss": 0.1004, + "step": 479 + }, + { + "epoch": 4.2105263157894735, + "grad_norm": 0.7296048402786255, + "learning_rate": 1.3602332662288536e-06, + "loss": 0.1057, + "step": 480 + }, + { + "epoch": 4.219298245614035, + "grad_norm": 0.7007803916931152, + "learning_rate": 1.348280191717427e-06, + "loss": 0.1007, + "step": 481 + }, + { + "epoch": 4.228070175438597, + "grad_norm": 0.948968231678009, + "learning_rate": 1.3363604460185031e-06, + "loss": 0.1005, + "step": 482 + }, + { + "epoch": 4.2368421052631575, + "grad_norm": 0.6567812561988831, + "learning_rate": 1.3244743740692496e-06, + "loss": 0.1016, + "step": 483 + }, + 
{ + "epoch": 4.245614035087719, + "grad_norm": 0.5390146374702454, + "learning_rate": 1.3126223198323752e-06, + "loss": 0.1025, + "step": 484 + }, + { + "epoch": 4.254385964912281, + "grad_norm": 0.43638724088668823, + "learning_rate": 1.3008046262861696e-06, + "loss": 0.1053, + "step": 485 + }, + { + "epoch": 4.2631578947368425, + "grad_norm": 0.43589839339256287, + "learning_rate": 1.289021635414589e-06, + "loss": 0.1036, + "step": 486 + }, + { + "epoch": 4.271929824561403, + "grad_norm": 0.3999694585800171, + "learning_rate": 1.277273688197346e-06, + "loss": 0.1023, + "step": 487 + }, + { + "epoch": 4.280701754385965, + "grad_norm": 0.6314297914505005, + "learning_rate": 1.265561124600057e-06, + "loss": 0.0993, + "step": 488 + }, + { + "epoch": 4.2894736842105265, + "grad_norm": 0.566033124923706, + "learning_rate": 1.2538842835643906e-06, + "loss": 0.1029, + "step": 489 + }, + { + "epoch": 4.298245614035087, + "grad_norm": 0.6713336110115051, + "learning_rate": 1.2422435029982669e-06, + "loss": 0.1002, + "step": 490 + }, + { + "epoch": 4.307017543859649, + "grad_norm": 0.428574800491333, + "learning_rate": 1.2306391197660797e-06, + "loss": 0.1028, + "step": 491 + }, + { + "epoch": 4.315789473684211, + "grad_norm": 0.637745201587677, + "learning_rate": 1.219071469678941e-06, + "loss": 0.1009, + "step": 492 + }, + { + "epoch": 4.324561403508772, + "grad_norm": 0.8204445242881775, + "learning_rate": 1.2075408874849747e-06, + "loss": 0.099, + "step": 493 + }, + { + "epoch": 4.333333333333333, + "grad_norm": 1.010758876800537, + "learning_rate": 1.1960477068596155e-06, + "loss": 0.1006, + "step": 494 + }, + { + "epoch": 4.342105263157895, + "grad_norm": 0.908112108707428, + "learning_rate": 1.1845922603959677e-06, + "loss": 0.1047, + "step": 495 + }, + { + "epoch": 4.350877192982456, + "grad_norm": 1.0254642963409424, + "learning_rate": 1.173174879595166e-06, + "loss": 0.0991, + "step": 496 + }, + { + "epoch": 4.359649122807017, + "grad_norm": 0.5159414410591125, + 
"learning_rate": 1.1617958948567967e-06, + "loss": 0.0978, + "step": 497 + }, + { + "epoch": 4.368421052631579, + "grad_norm": 0.9525816440582275, + "learning_rate": 1.1504556354693227e-06, + "loss": 0.1051, + "step": 498 + }, + { + "epoch": 4.37719298245614, + "grad_norm": 0.9321548938751221, + "learning_rate": 1.1391544296005652e-06, + "loss": 0.1011, + "step": 499 + }, + { + "epoch": 4.385964912280702, + "grad_norm": 0.7308889627456665, + "learning_rate": 1.1278926042882026e-06, + "loss": 0.1002, + "step": 500 + }, + { + "epoch": 4.394736842105263, + "grad_norm": 0.9508903622627258, + "learning_rate": 1.116670485430304e-06, + "loss": 0.1013, + "step": 501 + }, + { + "epoch": 4.4035087719298245, + "grad_norm": 0.5174031853675842, + "learning_rate": 1.1054883977759067e-06, + "loss": 0.104, + "step": 502 + }, + { + "epoch": 4.412280701754386, + "grad_norm": 0.4504610598087311, + "learning_rate": 1.0943466649156061e-06, + "loss": 0.1013, + "step": 503 + }, + { + "epoch": 4.421052631578947, + "grad_norm": 0.5650261044502258, + "learning_rate": 1.0832456092722063e-06, + "loss": 0.0995, + "step": 504 + }, + { + "epoch": 4.4298245614035086, + "grad_norm": 0.37759432196617126, + "learning_rate": 1.0721855520913751e-06, + "loss": 0.1058, + "step": 505 + }, + { + "epoch": 4.43859649122807, + "grad_norm": 0.7238495349884033, + "learning_rate": 1.0611668134323577e-06, + "loss": 0.1012, + "step": 506 + }, + { + "epoch": 4.447368421052632, + "grad_norm": 0.6301494240760803, + "learning_rate": 1.0501897121587127e-06, + "loss": 0.1009, + "step": 507 + }, + { + "epoch": 4.456140350877193, + "grad_norm": 0.9531002044677734, + "learning_rate": 1.0392545659290789e-06, + "loss": 0.1021, + "step": 508 + }, + { + "epoch": 4.464912280701754, + "grad_norm": 0.4423767924308777, + "learning_rate": 1.0283616911879943e-06, + "loss": 0.1024, + "step": 509 + }, + { + "epoch": 4.473684210526316, + "grad_norm": 0.5573019981384277, + "learning_rate": 1.0175114031567246e-06, + "loss": 0.1011, + 
"step": 510 + }, + { + "epoch": 4.482456140350878, + "grad_norm": 0.9792631268501282, + "learning_rate": 1.0067040158241555e-06, + "loss": 0.1039, + "step": 511 + }, + { + "epoch": 4.491228070175438, + "grad_norm": 1.7911303043365479, + "learning_rate": 9.95939841937693e-07, + "loss": 0.104, + "step": 512 + }, + { + "epoch": 4.5, + "grad_norm": 0.5825617909431458, + "learning_rate": 9.852191929942262e-07, + "loss": 0.0987, + "step": 513 + }, + { + "epoch": 4.508771929824562, + "grad_norm": 0.3129921555519104, + "learning_rate": 9.745423792310996e-07, + "loss": 0.0979, + "step": 514 + }, + { + "epoch": 4.517543859649123, + "grad_norm": 0.5376678705215454, + "learning_rate": 9.63909709617146e-07, + "loss": 0.0998, + "step": 515 + }, + { + "epoch": 4.526315789473684, + "grad_norm": 0.48920008540153503, + "learning_rate": 9.533214918437422e-07, + "loss": 0.1017, + "step": 516 + }, + { + "epoch": 4.535087719298246, + "grad_norm": 0.36829131841659546, + "learning_rate": 9.427780323159006e-07, + "loss": 0.1004, + "step": 517 + }, + { + "epoch": 4.543859649122807, + "grad_norm": 0.5459544658660889, + "learning_rate": 9.322796361434111e-07, + "loss": 0.1041, + "step": 518 + }, + { + "epoch": 4.552631578947368, + "grad_norm": 0.8460657000541687, + "learning_rate": 9.218266071320015e-07, + "loss": 0.1012, + "step": 519 + }, + { + "epoch": 4.56140350877193, + "grad_norm": 0.7692683339118958, + "learning_rate": 9.114192477745568e-07, + "loss": 0.1013, + "step": 520 + }, + { + "epoch": 4.5701754385964914, + "grad_norm": 0.4503592550754547, + "learning_rate": 9.010578592423544e-07, + "loss": 0.107, + "step": 521 + }, + { + "epoch": 4.578947368421053, + "grad_norm": 0.9348855018615723, + "learning_rate": 8.907427413763572e-07, + "loss": 0.102, + "step": 522 + }, + { + "epoch": 4.587719298245614, + "grad_norm": 0.7902988791465759, + "learning_rate": 8.804741926785335e-07, + "loss": 0.1032, + "step": 523 + }, + { + "epoch": 4.5964912280701755, + "grad_norm": 0.5444673299789429, + 
"learning_rate": 8.702525103032186e-07, + "loss": 0.0993, + "step": 524 + }, + { + "epoch": 4.605263157894737, + "grad_norm": 0.728112518787384, + "learning_rate": 8.60077990048517e-07, + "loss": 0.1021, + "step": 525 + }, + { + "epoch": 4.614035087719298, + "grad_norm": 0.5250695943832397, + "learning_rate": 8.499509263477388e-07, + "loss": 0.1018, + "step": 526 + }, + { + "epoch": 4.62280701754386, + "grad_norm": 0.3112829625606537, + "learning_rate": 8.398716122608868e-07, + "loss": 0.1037, + "step": 527 + }, + { + "epoch": 4.631578947368421, + "grad_norm": 0.9097342491149902, + "learning_rate": 8.298403394661658e-07, + "loss": 0.1015, + "step": 528 + }, + { + "epoch": 4.640350877192983, + "grad_norm": 0.6663810014724731, + "learning_rate": 8.198573982515537e-07, + "loss": 0.1038, + "step": 529 + }, + { + "epoch": 4.649122807017544, + "grad_norm": 1.1880309581756592, + "learning_rate": 8.099230775063879e-07, + "loss": 0.1044, + "step": 530 + }, + { + "epoch": 4.657894736842105, + "grad_norm": 0.6492993831634521, + "learning_rate": 8.000376647130165e-07, + "loss": 0.103, + "step": 531 + }, + { + "epoch": 4.666666666666667, + "grad_norm": 0.43723204731941223, + "learning_rate": 7.902014459384744e-07, + "loss": 0.1025, + "step": 532 + }, + { + "epoch": 4.675438596491228, + "grad_norm": 0.8422684669494629, + "learning_rate": 7.804147058262015e-07, + "loss": 0.1035, + "step": 533 + }, + { + "epoch": 4.684210526315789, + "grad_norm": 0.6502094268798828, + "learning_rate": 7.706777275878161e-07, + "loss": 0.0994, + "step": 534 + }, + { + "epoch": 4.692982456140351, + "grad_norm": 0.5709391236305237, + "learning_rate": 7.609907929949045e-07, + "loss": 0.1056, + "step": 535 + }, + { + "epoch": 4.701754385964913, + "grad_norm": 0.4126770496368408, + "learning_rate": 7.513541823708828e-07, + "loss": 0.101, + "step": 536 + }, + { + "epoch": 4.7105263157894735, + "grad_norm": 0.5016621947288513, + "learning_rate": 7.417681745828706e-07, + "loss": 0.0999, + "step": 537 + }, + 
{ + "epoch": 4.719298245614035, + "grad_norm": 0.8139487504959106, + "learning_rate": 7.322330470336314e-07, + "loss": 0.0984, + "step": 538 + }, + { + "epoch": 4.728070175438597, + "grad_norm": 0.5805723667144775, + "learning_rate": 7.227490756535396e-07, + "loss": 0.1011, + "step": 539 + }, + { + "epoch": 4.7368421052631575, + "grad_norm": 0.7970795631408691, + "learning_rate": 7.133165348925978e-07, + "loss": 0.1016, + "step": 540 + }, + { + "epoch": 4.745614035087719, + "grad_norm": 0.6336880326271057, + "learning_rate": 7.039356977124937e-07, + "loss": 0.1027, + "step": 541 + }, + { + "epoch": 4.754385964912281, + "grad_norm": 0.2953254282474518, + "learning_rate": 6.946068355786992e-07, + "loss": 0.1022, + "step": 542 + }, + { + "epoch": 4.7631578947368425, + "grad_norm": 0.5646472573280334, + "learning_rate": 6.853302184526217e-07, + "loss": 0.0998, + "step": 543 + }, + { + "epoch": 4.771929824561403, + "grad_norm": 0.6545483469963074, + "learning_rate": 6.761061147837808e-07, + "loss": 0.0985, + "step": 544 + }, + { + "epoch": 4.780701754385965, + "grad_norm": 0.8741705417633057, + "learning_rate": 6.669347915020524e-07, + "loss": 0.1006, + "step": 545 + }, + { + "epoch": 4.7894736842105265, + "grad_norm": 0.8579487204551697, + "learning_rate": 6.578165140099318e-07, + "loss": 0.1037, + "step": 546 + }, + { + "epoch": 4.798245614035087, + "grad_norm": 1.0744833946228027, + "learning_rate": 6.487515461748631e-07, + "loss": 0.1017, + "step": 547 + }, + { + "epoch": 4.807017543859649, + "grad_norm": 0.4954414367675781, + "learning_rate": 6.397401503215992e-07, + "loss": 0.1006, + "step": 548 + }, + { + "epoch": 4.815789473684211, + "grad_norm": 0.525191068649292, + "learning_rate": 6.307825872246076e-07, + "loss": 0.1024, + "step": 549 + }, + { + "epoch": 4.824561403508772, + "grad_norm": 0.8922368288040161, + "learning_rate": 6.218791161005336e-07, + "loss": 0.0999, + "step": 550 + }, + { + "epoch": 4.833333333333333, + "grad_norm": 0.6471604704856873, + 
"learning_rate": 6.13029994600686e-07, + "loss": 0.0994, + "step": 551 + }, + { + "epoch": 4.842105263157895, + "grad_norm": 0.49826696515083313, + "learning_rate": 6.042354788035943e-07, + "loss": 0.1003, + "step": 552 + }, + { + "epoch": 4.850877192982456, + "grad_norm": 0.7908043265342712, + "learning_rate": 5.954958232075858e-07, + "loss": 0.1003, + "step": 553 + }, + { + "epoch": 4.859649122807017, + "grad_norm": 0.40011560916900635, + "learning_rate": 5.868112807234313e-07, + "loss": 0.0991, + "step": 554 + }, + { + "epoch": 4.868421052631579, + "grad_norm": 0.9797350764274597, + "learning_rate": 5.781821026670203e-07, + "loss": 0.1005, + "step": 555 + }, + { + "epoch": 4.87719298245614, + "grad_norm": 0.4581677317619324, + "learning_rate": 5.696085387520894e-07, + "loss": 0.1013, + "step": 556 + }, + { + "epoch": 4.885964912280702, + "grad_norm": 0.6596454381942749, + "learning_rate": 5.610908370829981e-07, + "loss": 0.1028, + "step": 557 + }, + { + "epoch": 4.894736842105263, + "grad_norm": 0.5106292963027954, + "learning_rate": 5.526292441475448e-07, + "loss": 0.1023, + "step": 558 + }, + { + "epoch": 4.9035087719298245, + "grad_norm": 0.5137461423873901, + "learning_rate": 5.442240048098402e-07, + "loss": 0.1036, + "step": 559 + }, + { + "epoch": 4.912280701754386, + "grad_norm": 0.4619182348251343, + "learning_rate": 5.358753623032137e-07, + "loss": 0.0979, + "step": 560 + }, + { + "epoch": 4.921052631578947, + "grad_norm": 0.5350770354270935, + "learning_rate": 5.275835582231833e-07, + "loss": 0.0992, + "step": 561 + }, + { + "epoch": 4.9298245614035086, + "grad_norm": 0.7599822878837585, + "learning_rate": 5.193488325204551e-07, + "loss": 0.0983, + "step": 562 + }, + { + "epoch": 4.93859649122807, + "grad_norm": 0.47537004947662354, + "learning_rate": 5.111714234939868e-07, + "loss": 0.1004, + "step": 563 + }, + { + "epoch": 4.947368421052632, + "grad_norm": 0.597273588180542, + "learning_rate": 5.030515677840883e-07, + "loss": 0.1015, + "step": 564 + 
}, + { + "epoch": 4.956140350877193, + "grad_norm": 0.7155528664588928, + "learning_rate": 4.949895003655728e-07, + "loss": 0.1017, + "step": 565 + }, + { + "epoch": 4.964912280701754, + "grad_norm": 0.530358612537384, + "learning_rate": 4.869854545409627e-07, + "loss": 0.0998, + "step": 566 + }, + { + "epoch": 4.973684210526316, + "grad_norm": 0.6721721291542053, + "learning_rate": 4.790396619337286e-07, + "loss": 0.1003, + "step": 567 + }, + { + "epoch": 4.982456140350877, + "grad_norm": 0.8486731648445129, + "learning_rate": 4.711523524815978e-07, + "loss": 0.0996, + "step": 568 + }, + { + "epoch": 4.991228070175438, + "grad_norm": 0.7072808742523193, + "learning_rate": 4.633237544298891e-07, + "loss": 0.1004, + "step": 569 + }, + { + "epoch": 5.0, + "grad_norm": 0.41283953189849854, + "learning_rate": 4.555540943249187e-07, + "loss": 0.1026, + "step": 570 + } + ], + "logging_steps": 1, + "max_steps": 684, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 114, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.72999503707426e+19, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-570/training_args.bin b/checkpoint-570/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..38c27bdabb0e0e68242bce9d9302628a34f6e7cf --- /dev/null +++ b/checkpoint-570/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7cb0553c2c3dd5a010aed55eae3afd8bd7f096b43ba03d25af54dc26191426ae +size 7992 diff --git a/checkpoint-570/zero_to_fp32.py b/checkpoint-570/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/checkpoint-570/zero_to_fp32.py @@ -0,0 +1,604 @@ 
+#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + 
file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + 
+ z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, 
zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + 
full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + 
wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # 
recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. 
+ + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. 
(one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/checkpoint-684/README.md b/checkpoint-684/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f4a3934800eeb082a0cb833d7b6af4f68eed3615 --- /dev/null +++ b/checkpoint-684/README.md @@ -0,0 +1,202 @@ +--- +base_model: nvidia/Llama-3_3-Nemotron-Super-49B-v1 +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.0 \ No newline at end of file diff --git a/checkpoint-684/adapter_config.json b/checkpoint-684/adapter_config.json new file mode 100644 index 
0000000000000000000000000000000000000000..04e5237df60f7183856cc551f942e0ea492ed0be --- /dev/null +++ b/checkpoint-684/adapter_config.json @@ -0,0 +1,42 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "nvidia/Llama-3_3-Nemotron-Super-49B-v1", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 512, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [ + "embed_tokens", + "lm_head" + ], + "peft_type": "LORA", + "r": 256, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "k_proj", + "q_proj", + "v_proj", + "down_proj", + "gate_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-684/adapter_model.safetensors b/checkpoint-684/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d071e8337a127c8780a346e6e69c4e2195786154 --- /dev/null +++ b/checkpoint-684/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c08cabaa331365104eda0f955b3bcca40f58f5ba2408e03aedf9cc235c104191 +size 9016826528 diff --git a/checkpoint-684/global_step684/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-684/global_step684/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..149af1d5e1015eab2003622da67a889ac84b9518 --- /dev/null +++ b/checkpoint-684/global_step684/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:d6e27fa02802a0dd6d25d10cf49d0c1a101925347cf6520883bf0b4d10b9d864 +size 27050164444 diff --git a/checkpoint-684/global_step684/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-684/global_step684/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..11e077895710f8291da3f390a7bbeac47244c64f --- /dev/null +++ b/checkpoint-684/global_step684/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:912be014e27defd74da10296591495afa96b599a5d2c146be81081413e6a81e4 +size 27050169884 diff --git a/checkpoint-684/global_step684/mp_rank_00_model_states.pt b/checkpoint-684/global_step684/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e688a60d4a45499a9a698c47b81b7b14df4c192f --- /dev/null +++ b/checkpoint-684/global_step684/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7f5150b6fe3f1d5ba4817fa4a2a2b4f38d7d0cc97fc70f92b69861b6e3b7371 +size 9776788601 diff --git a/checkpoint-684/latest b/checkpoint-684/latest new file mode 100644 index 0000000000000000000000000000000000000000..32b7f894d10e5e12f7ef9cea66d082aaff9baad6 --- /dev/null +++ b/checkpoint-684/latest @@ -0,0 +1 @@ +global_step684 \ No newline at end of file diff --git a/checkpoint-684/rng_state_0.pth b/checkpoint-684/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..e5def56c514e9207d48ed27325175e02388447eb --- /dev/null +++ b/checkpoint-684/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25cff4a257babccfd8e674add2d01ad4892c537ed897a74d1a9134b1885b4f7f +size 14512 diff --git a/checkpoint-684/rng_state_1.pth b/checkpoint-684/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..9bf1d6231fc6e68165bc83edb42b0dd0d3bea65d --- /dev/null +++ b/checkpoint-684/rng_state_1.pth @@ -0,0 +1,3 
@@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c864b15610207b2ff2d0d3d92423e5c186888dbcd07fc522ebe0404df39b8118 +size 14512 diff --git a/checkpoint-684/scheduler.pt b/checkpoint-684/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..40438486cd88f508a260f58d9ab24bfa9cf84217 --- /dev/null +++ b/checkpoint-684/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:413a7e9882fa261750972ae9e540d9a20775ad3cb6dc44fdda8e90c61665a5d3 +size 1064 diff --git a/checkpoint-684/special_tokens_map.json b/checkpoint-684/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..278b7f0f84be865c4687700ee7b3c63d89a51e18 --- /dev/null +++ b/checkpoint-684/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-684/tokenizer.json b/checkpoint-684/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-684/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-684/tokenizer_config.json b/checkpoint-684/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..edd01b980c1db496ea102a51c972ee8f5d1a2c74 --- /dev/null +++ b/checkpoint-684/tokenizer_config.json @@ -0,0 +1,2064 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + 
"content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": 
"<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": 
false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": 
false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": 
"<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": 
"<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}{%- if messages[0]['role'] == 'system' %}{%- set system_message = messages[0]['content']|trim %}{%- set messages = messages[1:] %}{%- else %}{%- set system_message = \"\" %}{%- endif %}{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}{{- system_message }}{{- \"<|eot_id|>\" }}{%- for message in messages %}{%- if message['role'] == 'assistant' and '</think>' in message['content'] %}{%- set content = message['content'].split('</think>')[-1].lstrip() %}{%- else %}{%- set content = message['content'] %}{%- endif %}{{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n' + content | trim + '<|eot_id|>' }}{%- endfor %}{%- if add_generation_prompt %}{{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}{%- endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + 
"model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|end_of_text|>", + "tokenizer_class": "PreTrainedTokenizer" +} diff --git a/checkpoint-684/trainer_state.json b/checkpoint-684/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..3dba80755a03b104345d40314f07c9a10d1bbb79 --- /dev/null +++ b/checkpoint-684/trainer_state.json @@ -0,0 +1,4821 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 6.0, + "eval_steps": 500, + "global_step": 684, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008771929824561403, + "grad_norm": 39.56407165527344, + "learning_rate": 5.0000000000000004e-08, + "loss": 5.1375, + "step": 1 + }, + { + "epoch": 0.017543859649122806, + "grad_norm": 40.30452346801758, + "learning_rate": 1.0000000000000001e-07, + "loss": 5.1185, + "step": 2 + }, + { + "epoch": 0.02631578947368421, + "grad_norm": 40.062313079833984, + "learning_rate": 1.5000000000000002e-07, + "loss": 5.0762, + "step": 3 + }, + { + "epoch": 0.03508771929824561, + "grad_norm": 39.17148208618164, + "learning_rate": 2.0000000000000002e-07, + "loss": 5.016, + "step": 4 + }, + { + "epoch": 0.043859649122807015, + "grad_norm": 40.67367172241211, + "learning_rate": 2.5000000000000004e-07, + "loss": 5.0428, + "step": 5 + }, + { + "epoch": 0.05263157894736842, + "grad_norm": 38.18095016479492, + "learning_rate": 3.0000000000000004e-07, + "loss": 5.2025, + "step": 6 + }, + { + "epoch": 0.06140350877192982, + "grad_norm": 39.12940979003906, + "learning_rate": 3.5000000000000004e-07, + "loss": 4.9896, + "step": 7 + }, + { + "epoch": 0.07017543859649122, + "grad_norm": 38.84568405151367, + "learning_rate": 4.0000000000000003e-07, + "loss": 5.1078, + "step": 8 + }, + { + "epoch": 0.07894736842105263, + "grad_norm": 39.38333511352539, + "learning_rate": 4.5000000000000003e-07, + "loss": 5.0808, + 
"step": 9 + }, + { + "epoch": 0.08771929824561403, + "grad_norm": 39.427650451660156, + "learning_rate": 5.000000000000001e-07, + "loss": 5.0534, + "step": 10 + }, + { + "epoch": 0.09649122807017543, + "grad_norm": 39.29513168334961, + "learning_rate": 5.5e-07, + "loss": 5.058, + "step": 11 + }, + { + "epoch": 0.10526315789473684, + "grad_norm": 39.641231536865234, + "learning_rate": 6.000000000000001e-07, + "loss": 5.0317, + "step": 12 + }, + { + "epoch": 0.11403508771929824, + "grad_norm": 37.91259765625, + "learning_rate": 6.5e-07, + "loss": 4.912, + "step": 13 + }, + { + "epoch": 0.12280701754385964, + "grad_norm": 38.203548431396484, + "learning_rate": 7.000000000000001e-07, + "loss": 4.9705, + "step": 14 + }, + { + "epoch": 0.13157894736842105, + "grad_norm": 39.15998840332031, + "learning_rate": 7.5e-07, + "loss": 4.6962, + "step": 15 + }, + { + "epoch": 0.14035087719298245, + "grad_norm": 37.754669189453125, + "learning_rate": 8.000000000000001e-07, + "loss": 4.6262, + "step": 16 + }, + { + "epoch": 0.14912280701754385, + "grad_norm": 35.871490478515625, + "learning_rate": 8.500000000000001e-07, + "loss": 4.5422, + "step": 17 + }, + { + "epoch": 0.15789473684210525, + "grad_norm": 36.16888427734375, + "learning_rate": 9.000000000000001e-07, + "loss": 4.664, + "step": 18 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 33.520118713378906, + "learning_rate": 9.500000000000001e-07, + "loss": 4.4697, + "step": 19 + }, + { + "epoch": 0.17543859649122806, + "grad_norm": 30.896282196044922, + "learning_rate": 1.0000000000000002e-06, + "loss": 4.3568, + "step": 20 + }, + { + "epoch": 0.18421052631578946, + "grad_norm": 29.944643020629883, + "learning_rate": 1.0500000000000001e-06, + "loss": 4.2269, + "step": 21 + }, + { + "epoch": 0.19298245614035087, + "grad_norm": 25.224485397338867, + "learning_rate": 1.1e-06, + "loss": 4.1272, + "step": 22 + }, + { + "epoch": 0.20175438596491227, + "grad_norm": 24.410480499267578, + "learning_rate": 
1.1500000000000002e-06, + "loss": 4.0585, + "step": 23 + }, + { + "epoch": 0.21052631578947367, + "grad_norm": 21.480648040771484, + "learning_rate": 1.2000000000000002e-06, + "loss": 3.9472, + "step": 24 + }, + { + "epoch": 0.21929824561403508, + "grad_norm": 20.61946678161621, + "learning_rate": 1.25e-06, + "loss": 3.8879, + "step": 25 + }, + { + "epoch": 0.22807017543859648, + "grad_norm": 19.578271865844727, + "learning_rate": 1.3e-06, + "loss": 3.6783, + "step": 26 + }, + { + "epoch": 0.23684210526315788, + "grad_norm": 17.418983459472656, + "learning_rate": 1.3500000000000002e-06, + "loss": 3.6826, + "step": 27 + }, + { + "epoch": 0.24561403508771928, + "grad_norm": 18.160301208496094, + "learning_rate": 1.4000000000000001e-06, + "loss": 3.478, + "step": 28 + }, + { + "epoch": 0.2543859649122807, + "grad_norm": 17.573204040527344, + "learning_rate": 1.45e-06, + "loss": 3.459, + "step": 29 + }, + { + "epoch": 0.2631578947368421, + "grad_norm": 17.1265869140625, + "learning_rate": 1.5e-06, + "loss": 3.3999, + "step": 30 + }, + { + "epoch": 0.2719298245614035, + "grad_norm": 15.527145385742188, + "learning_rate": 1.5500000000000002e-06, + "loss": 3.2817, + "step": 31 + }, + { + "epoch": 0.2807017543859649, + "grad_norm": 14.773847579956055, + "learning_rate": 1.6000000000000001e-06, + "loss": 3.234, + "step": 32 + }, + { + "epoch": 0.2894736842105263, + "grad_norm": 12.039301872253418, + "learning_rate": 1.6500000000000003e-06, + "loss": 3.132, + "step": 33 + }, + { + "epoch": 0.2982456140350877, + "grad_norm": 9.217979431152344, + "learning_rate": 1.7000000000000002e-06, + "loss": 3.0548, + "step": 34 + }, + { + "epoch": 0.30701754385964913, + "grad_norm": 7.575639724731445, + "learning_rate": 1.75e-06, + "loss": 2.9529, + "step": 35 + }, + { + "epoch": 0.3157894736842105, + "grad_norm": 7.496004104614258, + "learning_rate": 1.8000000000000001e-06, + "loss": 2.8967, + "step": 36 + }, + { + "epoch": 0.32456140350877194, + "grad_norm": 7.45414924621582, + 
"learning_rate": 1.85e-06, + "loss": 2.8837, + "step": 37 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 8.555658340454102, + "learning_rate": 1.9000000000000002e-06, + "loss": 2.7473, + "step": 38 + }, + { + "epoch": 0.34210526315789475, + "grad_norm": 10.03805160522461, + "learning_rate": 1.9500000000000004e-06, + "loss": 2.7355, + "step": 39 + }, + { + "epoch": 0.3508771929824561, + "grad_norm": 9.30649471282959, + "learning_rate": 2.0000000000000003e-06, + "loss": 2.6587, + "step": 40 + }, + { + "epoch": 0.35964912280701755, + "grad_norm": 8.510339736938477, + "learning_rate": 2.05e-06, + "loss": 2.5977, + "step": 41 + }, + { + "epoch": 0.3684210526315789, + "grad_norm": 4.709080696105957, + "learning_rate": 2.1000000000000002e-06, + "loss": 2.6286, + "step": 42 + }, + { + "epoch": 0.37719298245614036, + "grad_norm": 5.128961086273193, + "learning_rate": 2.15e-06, + "loss": 2.4558, + "step": 43 + }, + { + "epoch": 0.38596491228070173, + "grad_norm": 5.190136432647705, + "learning_rate": 2.2e-06, + "loss": 2.4432, + "step": 44 + }, + { + "epoch": 0.39473684210526316, + "grad_norm": 4.893551349639893, + "learning_rate": 2.25e-06, + "loss": 2.4939, + "step": 45 + }, + { + "epoch": 0.40350877192982454, + "grad_norm": 5.2434983253479, + "learning_rate": 2.3000000000000004e-06, + "loss": 2.3381, + "step": 46 + }, + { + "epoch": 0.41228070175438597, + "grad_norm": 5.122412204742432, + "learning_rate": 2.35e-06, + "loss": 2.313, + "step": 47 + }, + { + "epoch": 0.42105263157894735, + "grad_norm": 4.577274799346924, + "learning_rate": 2.4000000000000003e-06, + "loss": 2.2236, + "step": 48 + }, + { + "epoch": 0.4298245614035088, + "grad_norm": 4.722769737243652, + "learning_rate": 2.4500000000000003e-06, + "loss": 2.1987, + "step": 49 + }, + { + "epoch": 0.43859649122807015, + "grad_norm": 5.059235095977783, + "learning_rate": 2.5e-06, + "loss": 2.1415, + "step": 50 + }, + { + "epoch": 0.4473684210526316, + "grad_norm": 4.454439640045166, + "learning_rate": 
2.55e-06, + "loss": 2.0466, + "step": 51 + }, + { + "epoch": 0.45614035087719296, + "grad_norm": 4.94586706161499, + "learning_rate": 2.6e-06, + "loss": 1.8762, + "step": 52 + }, + { + "epoch": 0.4649122807017544, + "grad_norm": 4.704402446746826, + "learning_rate": 2.6500000000000005e-06, + "loss": 1.8012, + "step": 53 + }, + { + "epoch": 0.47368421052631576, + "grad_norm": 6.125903129577637, + "learning_rate": 2.7000000000000004e-06, + "loss": 1.7669, + "step": 54 + }, + { + "epoch": 0.4824561403508772, + "grad_norm": 4.5356059074401855, + "learning_rate": 2.7500000000000004e-06, + "loss": 1.6607, + "step": 55 + }, + { + "epoch": 0.49122807017543857, + "grad_norm": 6.56803035736084, + "learning_rate": 2.8000000000000003e-06, + "loss": 1.6291, + "step": 56 + }, + { + "epoch": 0.5, + "grad_norm": 4.910050392150879, + "learning_rate": 2.85e-06, + "loss": 1.5545, + "step": 57 + }, + { + "epoch": 0.5087719298245614, + "grad_norm": 8.733433723449707, + "learning_rate": 2.9e-06, + "loss": 1.4206, + "step": 58 + }, + { + "epoch": 0.5175438596491229, + "grad_norm": 8.582486152648926, + "learning_rate": 2.95e-06, + "loss": 1.3912, + "step": 59 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 13.710689544677734, + "learning_rate": 3e-06, + "loss": 1.3297, + "step": 60 + }, + { + "epoch": 0.5350877192982456, + "grad_norm": 23.400312423706055, + "learning_rate": 3.05e-06, + "loss": 1.296, + "step": 61 + }, + { + "epoch": 0.543859649122807, + "grad_norm": 5.678805351257324, + "learning_rate": 3.1000000000000004e-06, + "loss": 1.2259, + "step": 62 + }, + { + "epoch": 0.5526315789473685, + "grad_norm": 14.700899124145508, + "learning_rate": 3.1500000000000003e-06, + "loss": 1.1087, + "step": 63 + }, + { + "epoch": 0.5614035087719298, + "grad_norm": 19.38919448852539, + "learning_rate": 3.2000000000000003e-06, + "loss": 1.1805, + "step": 64 + }, + { + "epoch": 0.5701754385964912, + "grad_norm": 8.460039138793945, + "learning_rate": 3.2500000000000002e-06, + "loss": 1.0963, 
+ "step": 65 + }, + { + "epoch": 0.5789473684210527, + "grad_norm": 13.371014595031738, + "learning_rate": 3.3000000000000006e-06, + "loss": 1.0627, + "step": 66 + }, + { + "epoch": 0.5877192982456141, + "grad_norm": 22.380569458007812, + "learning_rate": 3.3500000000000005e-06, + "loss": 1.0869, + "step": 67 + }, + { + "epoch": 0.5964912280701754, + "grad_norm": 5.780513286590576, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.9991, + "step": 68 + }, + { + "epoch": 0.6052631578947368, + "grad_norm": 19.850841522216797, + "learning_rate": 3.45e-06, + "loss": 0.9683, + "step": 69 + }, + { + "epoch": 0.6140350877192983, + "grad_norm": 17.160703659057617, + "learning_rate": 3.5e-06, + "loss": 0.845, + "step": 70 + }, + { + "epoch": 0.6228070175438597, + "grad_norm": 14.264311790466309, + "learning_rate": 3.5500000000000003e-06, + "loss": 0.8059, + "step": 71 + }, + { + "epoch": 0.631578947368421, + "grad_norm": 26.39459991455078, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.85, + "step": 72 + }, + { + "epoch": 0.6403508771929824, + "grad_norm": 51.10348892211914, + "learning_rate": 3.65e-06, + "loss": 0.9755, + "step": 73 + }, + { + "epoch": 0.6491228070175439, + "grad_norm": 28.795856475830078, + "learning_rate": 3.7e-06, + "loss": 0.8966, + "step": 74 + }, + { + "epoch": 0.6578947368421053, + "grad_norm": 4.6617937088012695, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.7716, + "step": 75 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 15.729666709899902, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.7578, + "step": 76 + }, + { + "epoch": 0.6754385964912281, + "grad_norm": 7.109970569610596, + "learning_rate": 3.85e-06, + "loss": 0.7055, + "step": 77 + }, + { + "epoch": 0.6842105263157895, + "grad_norm": 20.84659194946289, + "learning_rate": 3.900000000000001e-06, + "loss": 0.7458, + "step": 78 + }, + { + "epoch": 0.6929824561403509, + "grad_norm": 21.601303100585938, + "learning_rate": 3.95e-06, + "loss": 0.6879, + "step": 
79 + }, + { + "epoch": 0.7017543859649122, + "grad_norm": 3.6914751529693604, + "learning_rate": 4.000000000000001e-06, + "loss": 0.6179, + "step": 80 + }, + { + "epoch": 0.7105263157894737, + "grad_norm": 16.539325714111328, + "learning_rate": 4.05e-06, + "loss": 0.5716, + "step": 81 + }, + { + "epoch": 0.7192982456140351, + "grad_norm": 13.931925773620605, + "learning_rate": 4.1e-06, + "loss": 0.558, + "step": 82 + }, + { + "epoch": 0.7280701754385965, + "grad_norm": 10.52951717376709, + "learning_rate": 4.15e-06, + "loss": 0.6018, + "step": 83 + }, + { + "epoch": 0.7368421052631579, + "grad_norm": 17.337060928344727, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.5501, + "step": 84 + }, + { + "epoch": 0.7456140350877193, + "grad_norm": 13.500468254089355, + "learning_rate": 4.25e-06, + "loss": 0.5214, + "step": 85 + }, + { + "epoch": 0.7543859649122807, + "grad_norm": 10.290645599365234, + "learning_rate": 4.3e-06, + "loss": 0.4996, + "step": 86 + }, + { + "epoch": 0.7631578947368421, + "grad_norm": 9.757556915283203, + "learning_rate": 4.350000000000001e-06, + "loss": 0.498, + "step": 87 + }, + { + "epoch": 0.7719298245614035, + "grad_norm": 9.325140953063965, + "learning_rate": 4.4e-06, + "loss": 0.4721, + "step": 88 + }, + { + "epoch": 0.7807017543859649, + "grad_norm": 2.9322128295898438, + "learning_rate": 4.450000000000001e-06, + "loss": 0.4528, + "step": 89 + }, + { + "epoch": 0.7894736842105263, + "grad_norm": 10.484073638916016, + "learning_rate": 4.5e-06, + "loss": 0.445, + "step": 90 + }, + { + "epoch": 0.7982456140350878, + "grad_norm": 32.7827262878418, + "learning_rate": 4.5500000000000005e-06, + "loss": 0.5105, + "step": 91 + }, + { + "epoch": 0.8070175438596491, + "grad_norm": 2.8477306365966797, + "learning_rate": 4.600000000000001e-06, + "loss": 0.4117, + "step": 92 + }, + { + "epoch": 0.8157894736842105, + "grad_norm": 2.7680225372314453, + "learning_rate": 4.65e-06, + "loss": 0.3653, + "step": 93 + }, + { + "epoch": 
0.8245614035087719, + "grad_norm": 2.6512742042541504, + "learning_rate": 4.7e-06, + "loss": 0.3878, + "step": 94 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 6.453914165496826, + "learning_rate": 4.75e-06, + "loss": 0.3611, + "step": 95 + }, + { + "epoch": 0.8421052631578947, + "grad_norm": 3.4594080448150635, + "learning_rate": 4.800000000000001e-06, + "loss": 0.3817, + "step": 96 + }, + { + "epoch": 0.8508771929824561, + "grad_norm": 3.6144917011260986, + "learning_rate": 4.85e-06, + "loss": 0.3618, + "step": 97 + }, + { + "epoch": 0.8596491228070176, + "grad_norm": 5.349407196044922, + "learning_rate": 4.9000000000000005e-06, + "loss": 0.3218, + "step": 98 + }, + { + "epoch": 0.868421052631579, + "grad_norm": 13.671236991882324, + "learning_rate": 4.95e-06, + "loss": 0.3329, + "step": 99 + }, + { + "epoch": 0.8771929824561403, + "grad_norm": 5.84046745300293, + "learning_rate": 5e-06, + "loss": 0.2967, + "step": 100 + }, + { + "epoch": 0.8859649122807017, + "grad_norm": 14.005338668823242, + "learning_rate": 4.999963827125897e-06, + "loss": 0.303, + "step": 101 + }, + { + "epoch": 0.8947368421052632, + "grad_norm": 9.18114185333252, + "learning_rate": 4.999855309550366e-06, + "loss": 0.2762, + "step": 102 + }, + { + "epoch": 0.9035087719298246, + "grad_norm": 3.0800487995147705, + "learning_rate": 4.999674450413725e-06, + "loss": 0.2628, + "step": 103 + }, + { + "epoch": 0.9122807017543859, + "grad_norm": 82.03578186035156, + "learning_rate": 4.999421254949728e-06, + "loss": 0.4065, + "step": 104 + }, + { + "epoch": 0.9210526315789473, + "grad_norm": 77.66315460205078, + "learning_rate": 4.99909573048542e-06, + "loss": 0.4307, + "step": 105 + }, + { + "epoch": 0.9298245614035088, + "grad_norm": 18.28767967224121, + "learning_rate": 4.998697886440927e-06, + "loss": 0.2571, + "step": 106 + }, + { + "epoch": 0.9385964912280702, + "grad_norm": 5.960445880889893, + "learning_rate": 4.998227734329177e-06, + "loss": 0.2847, + "step": 107 + }, + { + "epoch": 
0.9473684210526315, + "grad_norm": 5.437699794769287, + "learning_rate": 4.9976852877555755e-06, + "loss": 0.2728, + "step": 108 + }, + { + "epoch": 0.956140350877193, + "grad_norm": 3.379631280899048, + "learning_rate": 4.997070562417602e-06, + "loss": 0.2467, + "step": 109 + }, + { + "epoch": 0.9649122807017544, + "grad_norm": 3.1625075340270996, + "learning_rate": 4.996383576104362e-06, + "loss": 0.2273, + "step": 110 + }, + { + "epoch": 0.9736842105263158, + "grad_norm": 15.588600158691406, + "learning_rate": 4.995624348696071e-06, + "loss": 0.2486, + "step": 111 + }, + { + "epoch": 0.9824561403508771, + "grad_norm": 2.631044387817383, + "learning_rate": 4.9947929021634815e-06, + "loss": 0.1964, + "step": 112 + }, + { + "epoch": 0.9912280701754386, + "grad_norm": 4.706504821777344, + "learning_rate": 4.993889260567239e-06, + "loss": 0.1901, + "step": 113 + }, + { + "epoch": 1.0, + "grad_norm": 10.368465423583984, + "learning_rate": 4.9929134500571954e-06, + "loss": 0.1996, + "step": 114 + }, + { + "epoch": 1.0087719298245614, + "grad_norm": 30.44986343383789, + "learning_rate": 4.991865498871647e-06, + "loss": 0.2606, + "step": 115 + }, + { + "epoch": 1.0175438596491229, + "grad_norm": 14.421515464782715, + "learning_rate": 4.99074543733652e-06, + "loss": 0.2394, + "step": 116 + }, + { + "epoch": 1.0263157894736843, + "grad_norm": 14.072005271911621, + "learning_rate": 4.989553297864489e-06, + "loss": 0.2288, + "step": 117 + }, + { + "epoch": 1.0350877192982457, + "grad_norm": 4.395325660705566, + "learning_rate": 4.988289114954045e-06, + "loss": 0.2129, + "step": 118 + }, + { + "epoch": 1.043859649122807, + "grad_norm": 7.286703586578369, + "learning_rate": 4.986952925188489e-06, + "loss": 0.186, + "step": 119 + }, + { + "epoch": 1.0526315789473684, + "grad_norm": 8.332784652709961, + "learning_rate": 4.98554476723488e-06, + "loss": 0.178, + "step": 120 + }, + { + "epoch": 1.0614035087719298, + "grad_norm": 1.3646447658538818, + "learning_rate": 
4.984064681842917e-06, + "loss": 0.1687, + "step": 121 + }, + { + "epoch": 1.0701754385964912, + "grad_norm": 4.494940757751465, + "learning_rate": 4.982512711843753e-06, + "loss": 0.1881, + "step": 122 + }, + { + "epoch": 1.0789473684210527, + "grad_norm": 3.3929836750030518, + "learning_rate": 4.980888902148757e-06, + "loss": 0.1764, + "step": 123 + }, + { + "epoch": 1.087719298245614, + "grad_norm": 1.8281155824661255, + "learning_rate": 4.979193299748225e-06, + "loss": 0.1602, + "step": 124 + }, + { + "epoch": 1.0964912280701755, + "grad_norm": 3.494239568710327, + "learning_rate": 4.977425953710005e-06, + "loss": 0.1729, + "step": 125 + }, + { + "epoch": 1.1052631578947367, + "grad_norm": 1.500410556793213, + "learning_rate": 4.975586915178084e-06, + "loss": 0.1666, + "step": 126 + }, + { + "epoch": 1.1140350877192982, + "grad_norm": 1.4680222272872925, + "learning_rate": 4.973676237371111e-06, + "loss": 0.159, + "step": 127 + }, + { + "epoch": 1.1228070175438596, + "grad_norm": 3.0383460521698, + "learning_rate": 4.971693975580851e-06, + "loss": 0.1484, + "step": 128 + }, + { + "epoch": 1.131578947368421, + "grad_norm": 3.74821138381958, + "learning_rate": 4.969640187170591e-06, + "loss": 0.1586, + "step": 129 + }, + { + "epoch": 1.1403508771929824, + "grad_norm": 4.682602405548096, + "learning_rate": 4.967514931573473e-06, + "loss": 0.1619, + "step": 130 + }, + { + "epoch": 1.1491228070175439, + "grad_norm": 3.90673565864563, + "learning_rate": 4.965318270290779e-06, + "loss": 0.164, + "step": 131 + }, + { + "epoch": 1.1578947368421053, + "grad_norm": 2.2017388343811035, + "learning_rate": 4.963050266890152e-06, + "loss": 0.1499, + "step": 132 + }, + { + "epoch": 1.1666666666666667, + "grad_norm": 2.4211816787719727, + "learning_rate": 4.960710987003753e-06, + "loss": 0.1387, + "step": 133 + }, + { + "epoch": 1.1754385964912282, + "grad_norm": 1.7753759622573853, + "learning_rate": 4.958300498326363e-06, + "loss": 0.1441, + "step": 134 + }, + { + "epoch": 
1.1842105263157894, + "grad_norm": 1.5529910326004028, + "learning_rate": 4.955818870613425e-06, + "loss": 0.1304, + "step": 135 + }, + { + "epoch": 1.1929824561403508, + "grad_norm": 2.090593099594116, + "learning_rate": 4.953266175679023e-06, + "loss": 0.1419, + "step": 136 + }, + { + "epoch": 1.2017543859649122, + "grad_norm": 2.7141878604888916, + "learning_rate": 4.95064248739381e-06, + "loss": 0.1444, + "step": 137 + }, + { + "epoch": 1.2105263157894737, + "grad_norm": 2.3690481185913086, + "learning_rate": 4.947947881682861e-06, + "loss": 0.1383, + "step": 138 + }, + { + "epoch": 1.219298245614035, + "grad_norm": 2.2403147220611572, + "learning_rate": 4.945182436523482e-06, + "loss": 0.1418, + "step": 139 + }, + { + "epoch": 1.2280701754385965, + "grad_norm": 1.3939160108566284, + "learning_rate": 4.942346231942955e-06, + "loss": 0.1307, + "step": 140 + }, + { + "epoch": 1.236842105263158, + "grad_norm": 11.276732444763184, + "learning_rate": 4.939439350016214e-06, + "loss": 0.1397, + "step": 141 + }, + { + "epoch": 1.2456140350877192, + "grad_norm": 8.260516166687012, + "learning_rate": 4.9364618748634794e-06, + "loss": 0.1426, + "step": 142 + }, + { + "epoch": 1.2543859649122808, + "grad_norm": 2.09720516204834, + "learning_rate": 4.933413892647819e-06, + "loss": 0.1323, + "step": 143 + }, + { + "epoch": 1.263157894736842, + "grad_norm": 1.802125334739685, + "learning_rate": 4.9302954915726535e-06, + "loss": 0.1304, + "step": 144 + }, + { + "epoch": 1.2719298245614035, + "grad_norm": 1.7151471376419067, + "learning_rate": 4.927106761879207e-06, + "loss": 0.1264, + "step": 145 + }, + { + "epoch": 1.280701754385965, + "grad_norm": 1.6970336437225342, + "learning_rate": 4.923847795843894e-06, + "loss": 0.1227, + "step": 146 + }, + { + "epoch": 1.2894736842105263, + "grad_norm": 16.60441017150879, + "learning_rate": 4.920518687775647e-06, + "loss": 0.1606, + "step": 147 + }, + { + "epoch": 1.2982456140350878, + "grad_norm": 6.470354080200195, + 
"learning_rate": 4.917119534013194e-06, + "loss": 0.1447, + "step": 148 + }, + { + "epoch": 1.3070175438596492, + "grad_norm": 1.4908231496810913, + "learning_rate": 4.913650432922264e-06, + "loss": 0.1343, + "step": 149 + }, + { + "epoch": 1.3157894736842106, + "grad_norm": 3.19964861869812, + "learning_rate": 4.91011148489274e-06, + "loss": 0.1354, + "step": 150 + }, + { + "epoch": 1.3245614035087718, + "grad_norm": 2.6052839756011963, + "learning_rate": 4.906502792335761e-06, + "loss": 0.1342, + "step": 151 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 2.0719165802001953, + "learning_rate": 4.9028244596807525e-06, + "loss": 0.1359, + "step": 152 + }, + { + "epoch": 1.3421052631578947, + "grad_norm": 0.8086919784545898, + "learning_rate": 4.899076593372405e-06, + "loss": 0.1279, + "step": 153 + }, + { + "epoch": 1.3508771929824561, + "grad_norm": 1.0056848526000977, + "learning_rate": 4.8952593018675955e-06, + "loss": 0.1162, + "step": 154 + }, + { + "epoch": 1.3596491228070176, + "grad_norm": 5.72553014755249, + "learning_rate": 4.891372695632249e-06, + "loss": 0.1315, + "step": 155 + }, + { + "epoch": 1.368421052631579, + "grad_norm": 1.522894024848938, + "learning_rate": 4.887416887138139e-06, + "loss": 0.1266, + "step": 156 + }, + { + "epoch": 1.3771929824561404, + "grad_norm": 2.019472122192383, + "learning_rate": 4.883391990859635e-06, + "loss": 0.1262, + "step": 157 + }, + { + "epoch": 1.3859649122807016, + "grad_norm": 1.8594422340393066, + "learning_rate": 4.879298123270391e-06, + "loss": 0.125, + "step": 158 + }, + { + "epoch": 1.3947368421052633, + "grad_norm": 1.365377426147461, + "learning_rate": 4.8751354028399725e-06, + "loss": 0.1218, + "step": 159 + }, + { + "epoch": 1.4035087719298245, + "grad_norm": 3.553309917449951, + "learning_rate": 4.870903950030429e-06, + "loss": 0.1272, + "step": 160 + }, + { + "epoch": 1.412280701754386, + "grad_norm": 2.1770920753479004, + "learning_rate": 4.866603887292809e-06, + "loss": 0.1213, + "step": 161 
+ }, + { + "epoch": 1.4210526315789473, + "grad_norm": 1.6058955192565918, + "learning_rate": 4.862235339063613e-06, + "loss": 0.1173, + "step": 162 + }, + { + "epoch": 1.4298245614035088, + "grad_norm": 1.3208314180374146, + "learning_rate": 4.857798431761199e-06, + "loss": 0.1183, + "step": 163 + }, + { + "epoch": 1.4385964912280702, + "grad_norm": 1.282729983329773, + "learning_rate": 4.853293293782118e-06, + "loss": 0.1209, + "step": 164 + }, + { + "epoch": 1.4473684210526316, + "grad_norm": 1.3838152885437012, + "learning_rate": 4.848720055497401e-06, + "loss": 0.1198, + "step": 165 + }, + { + "epoch": 1.456140350877193, + "grad_norm": 1.2930737733840942, + "learning_rate": 4.844078849248785e-06, + "loss": 0.1268, + "step": 166 + }, + { + "epoch": 1.4649122807017543, + "grad_norm": 1.7022266387939453, + "learning_rate": 4.839369809344888e-06, + "loss": 0.1198, + "step": 167 + }, + { + "epoch": 1.4736842105263157, + "grad_norm": 1.0927815437316895, + "learning_rate": 4.834593072057313e-06, + "loss": 0.1132, + "step": 168 + }, + { + "epoch": 1.4824561403508771, + "grad_norm": 0.9326333999633789, + "learning_rate": 4.829748775616716e-06, + "loss": 0.1193, + "step": 169 + }, + { + "epoch": 1.4912280701754386, + "grad_norm": 1.3564742803573608, + "learning_rate": 4.8248370602087954e-06, + "loss": 0.118, + "step": 170 + }, + { + "epoch": 1.5, + "grad_norm": 1.19778573513031, + "learning_rate": 4.819858067970243e-06, + "loss": 0.1122, + "step": 171 + }, + { + "epoch": 1.5087719298245614, + "grad_norm": 2.8438351154327393, + "learning_rate": 4.814811942984625e-06, + "loss": 0.1217, + "step": 172 + }, + { + "epoch": 1.5175438596491229, + "grad_norm": 1.0701063871383667, + "learning_rate": 4.809698831278217e-06, + "loss": 0.1114, + "step": 173 + }, + { + "epoch": 1.526315789473684, + "grad_norm": 0.9053553938865662, + "learning_rate": 4.804518880815776e-06, + "loss": 0.1178, + "step": 174 + }, + { + "epoch": 1.5350877192982457, + "grad_norm": 0.42274603247642517, + 
"learning_rate": 4.799272241496259e-06, + "loss": 0.1091, + "step": 175 + }, + { + "epoch": 1.543859649122807, + "grad_norm": 0.8576470017433167, + "learning_rate": 4.793959065148484e-06, + "loss": 0.1134, + "step": 176 + }, + { + "epoch": 1.5526315789473686, + "grad_norm": 0.5910662412643433, + "learning_rate": 4.78857950552674e-06, + "loss": 0.1148, + "step": 177 + }, + { + "epoch": 1.5614035087719298, + "grad_norm": 0.8761632442474365, + "learning_rate": 4.783133718306331e-06, + "loss": 0.1125, + "step": 178 + }, + { + "epoch": 1.5701754385964912, + "grad_norm": 1.9190795421600342, + "learning_rate": 4.777621861079079e-06, + "loss": 0.1148, + "step": 179 + }, + { + "epoch": 1.5789473684210527, + "grad_norm": 0.6199957728385925, + "learning_rate": 4.772044093348757e-06, + "loss": 0.1097, + "step": 180 + }, + { + "epoch": 1.587719298245614, + "grad_norm": 1.562089443206787, + "learning_rate": 4.766400576526479e-06, + "loss": 0.1097, + "step": 181 + }, + { + "epoch": 1.5964912280701755, + "grad_norm": 1.4957091808319092, + "learning_rate": 4.760691473926021e-06, + "loss": 0.1216, + "step": 182 + }, + { + "epoch": 1.6052631578947367, + "grad_norm": 0.9863570332527161, + "learning_rate": 4.754916950759105e-06, + "loss": 0.1122, + "step": 183 + }, + { + "epoch": 1.6140350877192984, + "grad_norm": 0.5803346633911133, + "learning_rate": 4.749077174130609e-06, + "loss": 0.1103, + "step": 184 + }, + { + "epoch": 1.6228070175438596, + "grad_norm": 1.8789891004562378, + "learning_rate": 4.743172313033738e-06, + "loss": 0.1191, + "step": 185 + }, + { + "epoch": 1.631578947368421, + "grad_norm": 0.8731380105018616, + "learning_rate": 4.7372025383451285e-06, + "loss": 0.1154, + "step": 186 + }, + { + "epoch": 1.6403508771929824, + "grad_norm": 1.3535627126693726, + "learning_rate": 4.7311680228199075e-06, + "loss": 0.1123, + "step": 187 + }, + { + "epoch": 1.6491228070175439, + "grad_norm": 0.7211089134216309, + "learning_rate": 4.725068941086693e-06, + "loss": 0.1134, + 
"step": 188 + }, + { + "epoch": 1.6578947368421053, + "grad_norm": 1.4752328395843506, + "learning_rate": 4.718905469642534e-06, + "loss": 0.1185, + "step": 189 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.9822680354118347, + "learning_rate": 4.712677786847814e-06, + "loss": 0.1146, + "step": 190 + }, + { + "epoch": 1.6754385964912282, + "grad_norm": 1.1308330297470093, + "learning_rate": 4.706386072921083e-06, + "loss": 0.1061, + "step": 191 + }, + { + "epoch": 1.6842105263157894, + "grad_norm": 5.331939697265625, + "learning_rate": 4.70003050993384e-06, + "loss": 0.1153, + "step": 192 + }, + { + "epoch": 1.692982456140351, + "grad_norm": 0.6911673545837402, + "learning_rate": 4.6936112818052674e-06, + "loss": 0.1098, + "step": 193 + }, + { + "epoch": 1.7017543859649122, + "grad_norm": 0.5160980224609375, + "learning_rate": 4.687128574296912e-06, + "loss": 0.1073, + "step": 194 + }, + { + "epoch": 1.7105263157894737, + "grad_norm": 1.5724798440933228, + "learning_rate": 4.680582575007303e-06, + "loss": 0.121, + "step": 195 + }, + { + "epoch": 1.719298245614035, + "grad_norm": 1.3960011005401611, + "learning_rate": 4.6739734733665275e-06, + "loss": 0.1145, + "step": 196 + }, + { + "epoch": 1.7280701754385965, + "grad_norm": 1.4949183464050293, + "learning_rate": 4.6673014606307465e-06, + "loss": 0.1166, + "step": 197 + }, + { + "epoch": 1.736842105263158, + "grad_norm": 1.6873422861099243, + "learning_rate": 4.660566729876661e-06, + "loss": 0.1115, + "step": 198 + }, + { + "epoch": 1.7456140350877192, + "grad_norm": 1.3443641662597656, + "learning_rate": 4.653769475995926e-06, + "loss": 0.1119, + "step": 199 + }, + { + "epoch": 1.7543859649122808, + "grad_norm": 0.807525098323822, + "learning_rate": 4.646909895689508e-06, + "loss": 0.1059, + "step": 200 + }, + { + "epoch": 1.763157894736842, + "grad_norm": 1.589316964149475, + "learning_rate": 4.639988187461995e-06, + "loss": 0.1151, + "step": 201 + }, + { + "epoch": 1.7719298245614035, + "grad_norm": 
2.474756956100464, + "learning_rate": 4.633004551615851e-06, + "loss": 0.116, + "step": 202 + }, + { + "epoch": 1.780701754385965, + "grad_norm": 0.6210195422172546, + "learning_rate": 4.62595919024562e-06, + "loss": 0.1097, + "step": 203 + }, + { + "epoch": 1.7894736842105263, + "grad_norm": 0.7217905521392822, + "learning_rate": 4.618852307232078e-06, + "loss": 0.1117, + "step": 204 + }, + { + "epoch": 1.7982456140350878, + "grad_norm": 1.551251769065857, + "learning_rate": 4.611684108236334e-06, + "loss": 0.113, + "step": 205 + }, + { + "epoch": 1.807017543859649, + "grad_norm": 0.6619828939437866, + "learning_rate": 4.604454800693874e-06, + "loss": 0.113, + "step": 206 + }, + { + "epoch": 1.8157894736842106, + "grad_norm": 0.9461805820465088, + "learning_rate": 4.597164593808564e-06, + "loss": 0.1093, + "step": 207 + }, + { + "epoch": 1.8245614035087718, + "grad_norm": 1.2926547527313232, + "learning_rate": 4.589813698546592e-06, + "loss": 0.1128, + "step": 208 + }, + { + "epoch": 1.8333333333333335, + "grad_norm": 0.8754212856292725, + "learning_rate": 4.582402327630368e-06, + "loss": 0.1104, + "step": 209 + }, + { + "epoch": 1.8421052631578947, + "grad_norm": 0.846051812171936, + "learning_rate": 4.574930695532357e-06, + "loss": 0.1105, + "step": 210 + }, + { + "epoch": 1.8508771929824561, + "grad_norm": 1.3332515954971313, + "learning_rate": 4.567399018468889e-06, + "loss": 0.1101, + "step": 211 + }, + { + "epoch": 1.8596491228070176, + "grad_norm": 0.8729192614555359, + "learning_rate": 4.5598075143938855e-06, + "loss": 0.1081, + "step": 212 + }, + { + "epoch": 1.868421052631579, + "grad_norm": 0.8618345260620117, + "learning_rate": 4.552156402992567e-06, + "loss": 0.1059, + "step": 213 + }, + { + "epoch": 1.8771929824561404, + "grad_norm": 1.2135930061340332, + "learning_rate": 4.544445905675082e-06, + "loss": 0.1105, + "step": 214 + }, + { + "epoch": 1.8859649122807016, + "grad_norm": 0.8405666351318359, + "learning_rate": 4.536676245570111e-06, + "loss": 
0.1118, + "step": 215 + }, + { + "epoch": 1.8947368421052633, + "grad_norm": 0.42860639095306396, + "learning_rate": 4.528847647518403e-06, + "loss": 0.1093, + "step": 216 + }, + { + "epoch": 1.9035087719298245, + "grad_norm": 1.1538206338882446, + "learning_rate": 4.520960338066271e-06, + "loss": 0.1088, + "step": 217 + }, + { + "epoch": 1.912280701754386, + "grad_norm": 0.5870749354362488, + "learning_rate": 4.513014545459038e-06, + "loss": 0.1061, + "step": 218 + }, + { + "epoch": 1.9210526315789473, + "grad_norm": 0.7279748916625977, + "learning_rate": 4.505010499634427e-06, + "loss": 0.1032, + "step": 219 + }, + { + "epoch": 1.9298245614035088, + "grad_norm": 0.6331414580345154, + "learning_rate": 4.4969484322159125e-06, + "loss": 0.1109, + "step": 220 + }, + { + "epoch": 1.9385964912280702, + "grad_norm": 0.9024543166160583, + "learning_rate": 4.488828576506014e-06, + "loss": 0.1094, + "step": 221 + }, + { + "epoch": 1.9473684210526314, + "grad_norm": 3.540376901626587, + "learning_rate": 4.480651167479545e-06, + "loss": 0.1154, + "step": 222 + }, + { + "epoch": 1.956140350877193, + "grad_norm": 0.9506739377975464, + "learning_rate": 4.472416441776817e-06, + "loss": 0.108, + "step": 223 + }, + { + "epoch": 1.9649122807017543, + "grad_norm": 0.6585081815719604, + "learning_rate": 4.464124637696786e-06, + "loss": 0.1033, + "step": 224 + }, + { + "epoch": 1.973684210526316, + "grad_norm": 1.143038034439087, + "learning_rate": 4.455775995190161e-06, + "loss": 0.1092, + "step": 225 + }, + { + "epoch": 1.9824561403508771, + "grad_norm": 1.148261547088623, + "learning_rate": 4.4473707558524555e-06, + "loss": 0.1076, + "step": 226 + }, + { + "epoch": 1.9912280701754386, + "grad_norm": 0.7375811338424683, + "learning_rate": 4.438909162917003e-06, + "loss": 0.108, + "step": 227 + }, + { + "epoch": 2.0, + "grad_norm": 0.5254591703414917, + "learning_rate": 4.430391461247911e-06, + "loss": 0.1079, + "step": 228 + }, + { + "epoch": 2.008771929824561, + "grad_norm": 
1.0198495388031006, + "learning_rate": 4.42181789733298e-06, + "loss": 0.1083, + "step": 229 + }, + { + "epoch": 2.017543859649123, + "grad_norm": 0.9234157800674438, + "learning_rate": 4.413188719276569e-06, + "loss": 0.1084, + "step": 230 + }, + { + "epoch": 2.026315789473684, + "grad_norm": 0.5215068459510803, + "learning_rate": 4.404504176792414e-06, + "loss": 0.1067, + "step": 231 + }, + { + "epoch": 2.0350877192982457, + "grad_norm": 0.9296736121177673, + "learning_rate": 4.3957645211964065e-06, + "loss": 0.1066, + "step": 232 + }, + { + "epoch": 2.043859649122807, + "grad_norm": 0.8660671710968018, + "learning_rate": 4.386970005399314e-06, + "loss": 0.108, + "step": 233 + }, + { + "epoch": 2.0526315789473686, + "grad_norm": 0.6014883518218994, + "learning_rate": 4.378120883899467e-06, + "loss": 0.1068, + "step": 234 + }, + { + "epoch": 2.06140350877193, + "grad_norm": 0.6370371580123901, + "learning_rate": 4.369217412775393e-06, + "loss": 0.1076, + "step": 235 + }, + { + "epoch": 2.0701754385964914, + "grad_norm": 0.9806828498840332, + "learning_rate": 4.360259849678402e-06, + "loss": 0.1071, + "step": 236 + }, + { + "epoch": 2.0789473684210527, + "grad_norm": 0.6093440651893616, + "learning_rate": 4.351248453825137e-06, + "loss": 0.1038, + "step": 237 + }, + { + "epoch": 2.087719298245614, + "grad_norm": 1.3494842052459717, + "learning_rate": 4.3421834859900695e-06, + "loss": 0.1105, + "step": 238 + }, + { + "epoch": 2.0964912280701755, + "grad_norm": 0.7621576189994812, + "learning_rate": 4.333065208497949e-06, + "loss": 0.1048, + "step": 239 + }, + { + "epoch": 2.1052631578947367, + "grad_norm": 0.5918282866477966, + "learning_rate": 4.3238938852162195e-06, + "loss": 0.1086, + "step": 240 + }, + { + "epoch": 2.1140350877192984, + "grad_norm": 0.7048676609992981, + "learning_rate": 4.314669781547379e-06, + "loss": 0.1061, + "step": 241 + }, + { + "epoch": 2.1228070175438596, + "grad_norm": 1.0750821828842163, + "learning_rate": 4.305393164421301e-06, + 
"loss": 0.1082, + "step": 242 + }, + { + "epoch": 2.1315789473684212, + "grad_norm": 0.6171414852142334, + "learning_rate": 4.296064302287507e-06, + "loss": 0.1039, + "step": 243 + }, + { + "epoch": 2.1403508771929824, + "grad_norm": 0.8080905079841614, + "learning_rate": 4.286683465107403e-06, + "loss": 0.1069, + "step": 244 + }, + { + "epoch": 2.1491228070175437, + "grad_norm": 0.5281466245651245, + "learning_rate": 4.277250924346461e-06, + "loss": 0.1069, + "step": 245 + }, + { + "epoch": 2.1578947368421053, + "grad_norm": 0.8070254325866699, + "learning_rate": 4.267766952966369e-06, + "loss": 0.1061, + "step": 246 + }, + { + "epoch": 2.1666666666666665, + "grad_norm": 0.8560577630996704, + "learning_rate": 4.25823182541713e-06, + "loss": 0.1116, + "step": 247 + }, + { + "epoch": 2.175438596491228, + "grad_norm": 0.7772330045700073, + "learning_rate": 4.2486458176291176e-06, + "loss": 0.1092, + "step": 248 + }, + { + "epoch": 2.1842105263157894, + "grad_norm": 0.814601719379425, + "learning_rate": 4.239009207005096e-06, + "loss": 0.1093, + "step": 249 + }, + { + "epoch": 2.192982456140351, + "grad_norm": 0.957789957523346, + "learning_rate": 4.2293222724121855e-06, + "loss": 0.1075, + "step": 250 + }, + { + "epoch": 2.2017543859649122, + "grad_norm": 0.500062108039856, + "learning_rate": 4.219585294173799e-06, + "loss": 0.1048, + "step": 251 + }, + { + "epoch": 2.2105263157894735, + "grad_norm": 0.3866419792175293, + "learning_rate": 4.209798554061527e-06, + "loss": 0.1074, + "step": 252 + }, + { + "epoch": 2.219298245614035, + "grad_norm": 1.1853291988372803, + "learning_rate": 4.199962335286985e-06, + "loss": 0.1076, + "step": 253 + }, + { + "epoch": 2.2280701754385963, + "grad_norm": 0.36602887511253357, + "learning_rate": 4.1900769224936125e-06, + "loss": 0.108, + "step": 254 + }, + { + "epoch": 2.236842105263158, + "grad_norm": 0.2530711889266968, + "learning_rate": 4.180142601748447e-06, + "loss": 0.1041, + "step": 255 + }, + { + "epoch": 
2.245614035087719, + "grad_norm": 1.3067054748535156, + "learning_rate": 4.170159660533834e-06, + "loss": 0.1087, + "step": 256 + }, + { + "epoch": 2.254385964912281, + "grad_norm": 0.3442043960094452, + "learning_rate": 4.160128387739114e-06, + "loss": 0.1099, + "step": 257 + }, + { + "epoch": 2.263157894736842, + "grad_norm": 1.174796462059021, + "learning_rate": 4.150049073652262e-06, + "loss": 0.1063, + "step": 258 + }, + { + "epoch": 2.2719298245614037, + "grad_norm": 0.5719411969184875, + "learning_rate": 4.1399220099514845e-06, + "loss": 0.1043, + "step": 259 + }, + { + "epoch": 2.280701754385965, + "grad_norm": 0.7268956303596497, + "learning_rate": 4.129747489696781e-06, + "loss": 0.1038, + "step": 260 + }, + { + "epoch": 2.2894736842105265, + "grad_norm": 0.7028316259384155, + "learning_rate": 4.119525807321467e-06, + "loss": 0.1052, + "step": 261 + }, + { + "epoch": 2.2982456140350878, + "grad_norm": 1.015335202217102, + "learning_rate": 4.109257258623644e-06, + "loss": 0.1116, + "step": 262 + }, + { + "epoch": 2.307017543859649, + "grad_norm": 0.7141755819320679, + "learning_rate": 4.098942140757646e-06, + "loss": 0.108, + "step": 263 + }, + { + "epoch": 2.3157894736842106, + "grad_norm": 0.7656403183937073, + "learning_rate": 4.0885807522254435e-06, + "loss": 0.1043, + "step": 264 + }, + { + "epoch": 2.324561403508772, + "grad_norm": 0.43293774127960205, + "learning_rate": 4.078173392867998e-06, + "loss": 0.1048, + "step": 265 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 0.6755763292312622, + "learning_rate": 4.0677203638565895e-06, + "loss": 0.1064, + "step": 266 + }, + { + "epoch": 2.3421052631578947, + "grad_norm": 0.9648827314376831, + "learning_rate": 4.0572219676841e-06, + "loss": 0.1088, + "step": 267 + }, + { + "epoch": 2.3508771929824563, + "grad_norm": 0.32724836468696594, + "learning_rate": 4.046678508156259e-06, + "loss": 0.1077, + "step": 268 + }, + { + "epoch": 2.3596491228070176, + "grad_norm": 0.4696657061576843, + 
"learning_rate": 4.036090290382855e-06, + "loss": 0.1067, + "step": 269 + }, + { + "epoch": 2.3684210526315788, + "grad_norm": 0.33901306986808777, + "learning_rate": 4.025457620768901e-06, + "loss": 0.105, + "step": 270 + }, + { + "epoch": 2.3771929824561404, + "grad_norm": 0.5703794360160828, + "learning_rate": 4.014780807005775e-06, + "loss": 0.1033, + "step": 271 + }, + { + "epoch": 2.3859649122807016, + "grad_norm": 0.9639355540275574, + "learning_rate": 4.004060158062306e-06, + "loss": 0.1041, + "step": 272 + }, + { + "epoch": 2.3947368421052633, + "grad_norm": 0.8851558566093445, + "learning_rate": 3.993295984175845e-06, + "loss": 0.1064, + "step": 273 + }, + { + "epoch": 2.4035087719298245, + "grad_norm": 0.5200062990188599, + "learning_rate": 3.982488596843276e-06, + "loss": 0.1056, + "step": 274 + }, + { + "epoch": 2.412280701754386, + "grad_norm": 1.160823106765747, + "learning_rate": 3.971638308812007e-06, + "loss": 0.1069, + "step": 275 + }, + { + "epoch": 2.4210526315789473, + "grad_norm": 1.0191210508346558, + "learning_rate": 3.9607454340709215e-06, + "loss": 0.1042, + "step": 276 + }, + { + "epoch": 2.4298245614035086, + "grad_norm": 0.37181487679481506, + "learning_rate": 3.949810287841289e-06, + "loss": 0.1062, + "step": 277 + }, + { + "epoch": 2.43859649122807, + "grad_norm": 0.9328593611717224, + "learning_rate": 3.9388331865676436e-06, + "loss": 0.1086, + "step": 278 + }, + { + "epoch": 2.4473684210526314, + "grad_norm": 0.8024734258651733, + "learning_rate": 3.927814447908625e-06, + "loss": 0.1051, + "step": 279 + }, + { + "epoch": 2.456140350877193, + "grad_norm": 0.9746696352958679, + "learning_rate": 3.916754390727795e-06, + "loss": 0.1041, + "step": 280 + }, + { + "epoch": 2.4649122807017543, + "grad_norm": 0.5457844138145447, + "learning_rate": 3.905653335084394e-06, + "loss": 0.1052, + "step": 281 + }, + { + "epoch": 2.473684210526316, + "grad_norm": 1.0736924409866333, + "learning_rate": 3.8945116022240945e-06, + "loss": 0.1075, + 
"step": 282 + }, + { + "epoch": 2.482456140350877, + "grad_norm": 0.6335628032684326, + "learning_rate": 3.8833295145696964e-06, + "loss": 0.1036, + "step": 283 + }, + { + "epoch": 2.4912280701754383, + "grad_norm": 0.6909618377685547, + "learning_rate": 3.872107395711799e-06, + "loss": 0.1089, + "step": 284 + }, + { + "epoch": 2.5, + "grad_norm": 2.1871702671051025, + "learning_rate": 3.860845570399435e-06, + "loss": 0.1066, + "step": 285 + }, + { + "epoch": 2.5087719298245617, + "grad_norm": 0.5831722617149353, + "learning_rate": 3.849544364530678e-06, + "loss": 0.1055, + "step": 286 + }, + { + "epoch": 2.517543859649123, + "grad_norm": 0.5302637815475464, + "learning_rate": 3.838204105143204e-06, + "loss": 0.1057, + "step": 287 + }, + { + "epoch": 2.526315789473684, + "grad_norm": 0.6348035931587219, + "learning_rate": 3.8268251204048335e-06, + "loss": 0.1089, + "step": 288 + }, + { + "epoch": 2.5350877192982457, + "grad_norm": 2.1932008266448975, + "learning_rate": 3.815407739604033e-06, + "loss": 0.1043, + "step": 289 + }, + { + "epoch": 2.543859649122807, + "grad_norm": 0.4388940930366516, + "learning_rate": 3.803952293140385e-06, + "loss": 0.1055, + "step": 290 + }, + { + "epoch": 2.5526315789473686, + "grad_norm": 0.6853339076042175, + "learning_rate": 3.7924591125150265e-06, + "loss": 0.1036, + "step": 291 + }, + { + "epoch": 2.56140350877193, + "grad_norm": 0.34744876623153687, + "learning_rate": 3.78092853032106e-06, + "loss": 0.1025, + "step": 292 + }, + { + "epoch": 2.5701754385964914, + "grad_norm": 0.9523847699165344, + "learning_rate": 3.769360880233922e-06, + "loss": 0.1067, + "step": 293 + }, + { + "epoch": 2.5789473684210527, + "grad_norm": 1.303745985031128, + "learning_rate": 3.7577564970017338e-06, + "loss": 0.1082, + "step": 294 + }, + { + "epoch": 2.587719298245614, + "grad_norm": 0.9468981623649597, + "learning_rate": 3.7461157164356103e-06, + "loss": 0.1055, + "step": 295 + }, + { + "epoch": 2.5964912280701755, + "grad_norm": 
0.7204175591468811, + "learning_rate": 3.7344388753999434e-06, + "loss": 0.1055, + "step": 296 + }, + { + "epoch": 2.6052631578947367, + "grad_norm": 0.5110165476799011, + "learning_rate": 3.7227263118026537e-06, + "loss": 0.1092, + "step": 297 + }, + { + "epoch": 2.6140350877192984, + "grad_norm": 0.6483246088027954, + "learning_rate": 3.7109783645854116e-06, + "loss": 0.1078, + "step": 298 + }, + { + "epoch": 2.6228070175438596, + "grad_norm": 0.5058422684669495, + "learning_rate": 3.699195373713831e-06, + "loss": 0.1073, + "step": 299 + }, + { + "epoch": 2.6315789473684212, + "grad_norm": 0.4123518764972687, + "learning_rate": 3.6873776801676265e-06, + "loss": 0.1053, + "step": 300 + }, + { + "epoch": 2.6403508771929824, + "grad_norm": 1.0864709615707397, + "learning_rate": 3.675525625930751e-06, + "loss": 0.1048, + "step": 301 + }, + { + "epoch": 2.6491228070175437, + "grad_norm": 1.0264904499053955, + "learning_rate": 3.6636395539814975e-06, + "loss": 0.1059, + "step": 302 + }, + { + "epoch": 2.6578947368421053, + "grad_norm": 0.7724822163581848, + "learning_rate": 3.651719808282573e-06, + "loss": 0.1063, + "step": 303 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.7474755644798279, + "learning_rate": 3.6397667337711475e-06, + "loss": 0.1034, + "step": 304 + }, + { + "epoch": 2.675438596491228, + "grad_norm": 0.5628909468650818, + "learning_rate": 3.6277806763488666e-06, + "loss": 0.1026, + "step": 305 + }, + { + "epoch": 2.6842105263157894, + "grad_norm": 0.9070547819137573, + "learning_rate": 3.6157619828718477e-06, + "loss": 0.1031, + "step": 306 + }, + { + "epoch": 2.692982456140351, + "grad_norm": 0.6968091130256653, + "learning_rate": 3.603711001140641e-06, + "loss": 0.1068, + "step": 307 + }, + { + "epoch": 2.7017543859649122, + "grad_norm": 0.3764977753162384, + "learning_rate": 3.5916280798901604e-06, + "loss": 0.1038, + "step": 308 + }, + { + "epoch": 2.7105263157894735, + "grad_norm": 5.012625694274902, + "learning_rate": 
3.5795135687795984e-06, + "loss": 0.1129, + "step": 309 + }, + { + "epoch": 2.719298245614035, + "grad_norm": 0.6745572686195374, + "learning_rate": 3.567367818382303e-06, + "loss": 0.1071, + "step": 310 + }, + { + "epoch": 2.7280701754385968, + "grad_norm": 1.0659606456756592, + "learning_rate": 3.555191180175634e-06, + "loss": 0.1067, + "step": 311 + }, + { + "epoch": 2.736842105263158, + "grad_norm": 1.7312604188919067, + "learning_rate": 3.5429840065307924e-06, + "loss": 0.1101, + "step": 312 + }, + { + "epoch": 2.745614035087719, + "grad_norm": 1.100364327430725, + "learning_rate": 3.5307466507026223e-06, + "loss": 0.1098, + "step": 313 + }, + { + "epoch": 2.754385964912281, + "grad_norm": 1.0390428304672241, + "learning_rate": 3.5184794668193893e-06, + "loss": 0.1094, + "step": 314 + }, + { + "epoch": 2.763157894736842, + "grad_norm": 0.3369971811771393, + "learning_rate": 3.5061828098725327e-06, + "loss": 0.1053, + "step": 315 + }, + { + "epoch": 2.7719298245614032, + "grad_norm": 0.6130257248878479, + "learning_rate": 3.4938570357063906e-06, + "loss": 0.106, + "step": 316 + }, + { + "epoch": 2.780701754385965, + "grad_norm": 0.6387595534324646, + "learning_rate": 3.481502501007904e-06, + "loss": 0.1044, + "step": 317 + }, + { + "epoch": 2.7894736842105265, + "grad_norm": 1.0731587409973145, + "learning_rate": 3.469119563296296e-06, + "loss": 0.1097, + "step": 318 + }, + { + "epoch": 2.7982456140350878, + "grad_norm": 0.8096229434013367, + "learning_rate": 3.4567085809127247e-06, + "loss": 0.1076, + "step": 319 + }, + { + "epoch": 2.807017543859649, + "grad_norm": 0.5034844279289246, + "learning_rate": 3.444269913009912e-06, + "loss": 0.1071, + "step": 320 + }, + { + "epoch": 2.8157894736842106, + "grad_norm": 0.675139307975769, + "learning_rate": 3.4318039195417536e-06, + "loss": 0.1039, + "step": 321 + }, + { + "epoch": 2.824561403508772, + "grad_norm": 0.7330355644226074, + "learning_rate": 3.4193109612528972e-06, + "loss": 0.1044, + "step": 322 + }, + { 
+ "epoch": 2.8333333333333335, + "grad_norm": 0.6558271646499634, + "learning_rate": 3.4067913996683115e-06, + "loss": 0.1051, + "step": 323 + }, + { + "epoch": 2.8421052631578947, + "grad_norm": 0.8411844372749329, + "learning_rate": 3.3942455970828146e-06, + "loss": 0.1063, + "step": 324 + }, + { + "epoch": 2.8508771929824563, + "grad_norm": 0.4817325174808502, + "learning_rate": 3.3816739165505964e-06, + "loss": 0.105, + "step": 325 + }, + { + "epoch": 2.8596491228070176, + "grad_norm": 0.424554705619812, + "learning_rate": 3.3690767218747104e-06, + "loss": 0.1037, + "step": 326 + }, + { + "epoch": 2.8684210526315788, + "grad_norm": 1.0054417848587036, + "learning_rate": 3.3564543775965475e-06, + "loss": 0.1058, + "step": 327 + }, + { + "epoch": 2.8771929824561404, + "grad_norm": 0.8984584808349609, + "learning_rate": 3.3438072489852837e-06, + "loss": 0.1079, + "step": 328 + }, + { + "epoch": 2.8859649122807016, + "grad_norm": 0.6779558062553406, + "learning_rate": 3.331135702027311e-06, + "loss": 0.1046, + "step": 329 + }, + { + "epoch": 2.8947368421052633, + "grad_norm": 0.6931657195091248, + "learning_rate": 3.318440103415649e-06, + "loss": 0.1106, + "step": 330 + }, + { + "epoch": 2.9035087719298245, + "grad_norm": 0.705264151096344, + "learning_rate": 3.305720820539329e-06, + "loss": 0.104, + "step": 331 + }, + { + "epoch": 2.912280701754386, + "grad_norm": 0.7799407839775085, + "learning_rate": 3.2929782214727657e-06, + "loss": 0.1019, + "step": 332 + }, + { + "epoch": 2.9210526315789473, + "grad_norm": 0.7583760619163513, + "learning_rate": 3.2802126749651042e-06, + "loss": 0.1049, + "step": 333 + }, + { + "epoch": 2.9298245614035086, + "grad_norm": 0.6145837306976318, + "learning_rate": 3.2674245504295505e-06, + "loss": 0.104, + "step": 334 + }, + { + "epoch": 2.93859649122807, + "grad_norm": 0.5170779228210449, + "learning_rate": 3.254614217932679e-06, + "loss": 0.1024, + "step": 335 + }, + { + "epoch": 2.9473684210526314, + "grad_norm": 
0.6850940585136414, + "learning_rate": 3.241782048183726e-06, + "loss": 0.1047, + "step": 336 + }, + { + "epoch": 2.956140350877193, + "grad_norm": 0.7307694554328918, + "learning_rate": 3.2289284125238597e-06, + "loss": 0.1032, + "step": 337 + }, + { + "epoch": 2.9649122807017543, + "grad_norm": 0.3386179208755493, + "learning_rate": 3.216053682915436e-06, + "loss": 0.1037, + "step": 338 + }, + { + "epoch": 2.973684210526316, + "grad_norm": 0.7565059065818787, + "learning_rate": 3.203158231931234e-06, + "loss": 0.1048, + "step": 339 + }, + { + "epoch": 2.982456140350877, + "grad_norm": 0.7902039289474487, + "learning_rate": 3.190242432743673e-06, + "loss": 0.1068, + "step": 340 + }, + { + "epoch": 2.9912280701754383, + "grad_norm": 0.42595192790031433, + "learning_rate": 3.177306659114015e-06, + "loss": 0.1039, + "step": 341 + }, + { + "epoch": 3.0, + "grad_norm": 1.1214542388916016, + "learning_rate": 3.164351285381549e-06, + "loss": 0.1062, + "step": 342 + }, + { + "epoch": 3.008771929824561, + "grad_norm": 0.7622955441474915, + "learning_rate": 3.1513766864527577e-06, + "loss": 0.1015, + "step": 343 + }, + { + "epoch": 3.017543859649123, + "grad_norm": 0.2676297724246979, + "learning_rate": 3.1383832377904676e-06, + "loss": 0.1037, + "step": 344 + }, + { + "epoch": 3.026315789473684, + "grad_norm": 0.8695605397224426, + "learning_rate": 3.1253713154029857e-06, + "loss": 0.1056, + "step": 345 + }, + { + "epoch": 3.0350877192982457, + "grad_norm": 0.5875906944274902, + "learning_rate": 3.1123412958332155e-06, + "loss": 0.1067, + "step": 346 + }, + { + "epoch": 3.043859649122807, + "grad_norm": 0.7699372172355652, + "learning_rate": 3.0992935561477632e-06, + "loss": 0.1035, + "step": 347 + }, + { + "epoch": 3.0526315789473686, + "grad_norm": 0.5919204354286194, + "learning_rate": 3.0862284739260247e-06, + "loss": 0.1023, + "step": 348 + }, + { + "epoch": 3.06140350877193, + "grad_norm": 1.3211849927902222, + "learning_rate": 3.07314642724926e-06, + "loss": 0.1065, 
+ "step": 349 + }, + { + "epoch": 3.0701754385964914, + "grad_norm": 0.6359637379646301, + "learning_rate": 3.0600477946896494e-06, + "loss": 0.106, + "step": 350 + }, + { + "epoch": 3.0789473684210527, + "grad_norm": 0.35776662826538086, + "learning_rate": 3.046932955299344e-06, + "loss": 0.1046, + "step": 351 + }, + { + "epoch": 3.087719298245614, + "grad_norm": 0.6657406687736511, + "learning_rate": 3.0338022885994904e-06, + "loss": 0.1076, + "step": 352 + }, + { + "epoch": 3.0964912280701755, + "grad_norm": 0.7587785720825195, + "learning_rate": 3.0206561745692512e-06, + "loss": 0.1043, + "step": 353 + }, + { + "epoch": 3.1052631578947367, + "grad_norm": 1.1258317232131958, + "learning_rate": 3.0074949936348084e-06, + "loss": 0.1043, + "step": 354 + }, + { + "epoch": 3.1140350877192984, + "grad_norm": 0.3570568263530731, + "learning_rate": 2.9943191266583564e-06, + "loss": 0.1032, + "step": 355 + }, + { + "epoch": 3.1228070175438596, + "grad_norm": 0.843485414981842, + "learning_rate": 2.981128954927075e-06, + "loss": 0.1045, + "step": 356 + }, + { + "epoch": 3.1315789473684212, + "grad_norm": 0.5719651579856873, + "learning_rate": 2.967924860142103e-06, + "loss": 0.1052, + "step": 357 + }, + { + "epoch": 3.1403508771929824, + "grad_norm": 2.20767879486084, + "learning_rate": 2.9547072244074853e-06, + "loss": 0.1078, + "step": 358 + }, + { + "epoch": 3.1491228070175437, + "grad_norm": 0.3715457022190094, + "learning_rate": 2.941476430219122e-06, + "loss": 0.1047, + "step": 359 + }, + { + "epoch": 3.1578947368421053, + "grad_norm": 0.7803200483322144, + "learning_rate": 2.928232860453694e-06, + "loss": 0.1029, + "step": 360 + }, + { + "epoch": 3.1666666666666665, + "grad_norm": 0.5198164582252502, + "learning_rate": 2.9149768983575884e-06, + "loss": 0.1032, + "step": 361 + }, + { + "epoch": 3.175438596491228, + "grad_norm": 0.7827185988426208, + "learning_rate": 2.9017089275358017e-06, + "loss": 0.1043, + "step": 362 + }, + { + "epoch": 3.1842105263157894, + 
"grad_norm": 0.4000351130962372, + "learning_rate": 2.8884293319408464e-06, + "loss": 0.1071, + "step": 363 + }, + { + "epoch": 3.192982456140351, + "grad_norm": 0.9913386106491089, + "learning_rate": 2.8751384958616318e-06, + "loss": 0.1022, + "step": 364 + }, + { + "epoch": 3.2017543859649122, + "grad_norm": 0.6975695490837097, + "learning_rate": 2.861836803912353e-06, + "loss": 0.1029, + "step": 365 + }, + { + "epoch": 3.2105263157894735, + "grad_norm": 0.2372695654630661, + "learning_rate": 2.8485246410213497e-06, + "loss": 0.1015, + "step": 366 + }, + { + "epoch": 3.219298245614035, + "grad_norm": 0.447732537984848, + "learning_rate": 2.835202392419977e-06, + "loss": 0.1052, + "step": 367 + }, + { + "epoch": 3.2280701754385963, + "grad_norm": 0.6617346405982971, + "learning_rate": 2.8218704436314525e-06, + "loss": 0.1055, + "step": 368 + }, + { + "epoch": 3.236842105263158, + "grad_norm": 0.5550402402877808, + "learning_rate": 2.8085291804596995e-06, + "loss": 0.102, + "step": 369 + }, + { + "epoch": 3.245614035087719, + "grad_norm": 0.6046020984649658, + "learning_rate": 2.795178988978185e-06, + "loss": 0.1036, + "step": 370 + }, + { + "epoch": 3.254385964912281, + "grad_norm": 0.41890618205070496, + "learning_rate": 2.781820255518745e-06, + "loss": 0.1036, + "step": 371 + }, + { + "epoch": 3.263157894736842, + "grad_norm": 0.8387415409088135, + "learning_rate": 2.768453366660408e-06, + "loss": 0.1076, + "step": 372 + }, + { + "epoch": 3.2719298245614037, + "grad_norm": 0.5318773984909058, + "learning_rate": 2.755078709218203e-06, + "loss": 0.1052, + "step": 373 + }, + { + "epoch": 3.280701754385965, + "grad_norm": 0.6617523431777954, + "learning_rate": 2.741696670231969e-06, + "loss": 0.1049, + "step": 374 + }, + { + "epoch": 3.2894736842105265, + "grad_norm": 1.0190025568008423, + "learning_rate": 2.728307636955156e-06, + "loss": 0.1034, + "step": 375 + }, + { + "epoch": 3.2982456140350878, + "grad_norm": 0.6924716234207153, + "learning_rate": 
2.714911996843617e-06, + "loss": 0.1065, + "step": 376 + }, + { + "epoch": 3.307017543859649, + "grad_norm": 0.42501118779182434, + "learning_rate": 2.701510137544393e-06, + "loss": 0.1019, + "step": 377 + }, + { + "epoch": 3.3157894736842106, + "grad_norm": 0.844886064529419, + "learning_rate": 2.6881024468845e-06, + "loss": 0.1047, + "step": 378 + }, + { + "epoch": 3.324561403508772, + "grad_norm": 0.46512728929519653, + "learning_rate": 2.674689312859704e-06, + "loss": 0.1043, + "step": 379 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 0.6242017149925232, + "learning_rate": 2.6612711236232915e-06, + "loss": 0.1046, + "step": 380 + }, + { + "epoch": 3.3421052631578947, + "grad_norm": 0.6578526496887207, + "learning_rate": 2.6478482674748375e-06, + "loss": 0.1031, + "step": 381 + }, + { + "epoch": 3.3508771929824563, + "grad_norm": 0.4822542667388916, + "learning_rate": 2.63442113284897e-06, + "loss": 0.1053, + "step": 382 + }, + { + "epoch": 3.3596491228070176, + "grad_norm": 0.48255595564842224, + "learning_rate": 2.6209901083041307e-06, + "loss": 0.1058, + "step": 383 + }, + { + "epoch": 3.3684210526315788, + "grad_norm": 0.6624025702476501, + "learning_rate": 2.6075555825113265e-06, + "loss": 0.1066, + "step": 384 + }, + { + "epoch": 3.3771929824561404, + "grad_norm": 0.6962618827819824, + "learning_rate": 2.5941179442428864e-06, + "loss": 0.102, + "step": 385 + }, + { + "epoch": 3.3859649122807016, + "grad_norm": 0.4976450502872467, + "learning_rate": 2.580677582361208e-06, + "loss": 0.1011, + "step": 386 + }, + { + "epoch": 3.3947368421052633, + "grad_norm": 0.5283737182617188, + "learning_rate": 2.5672348858075053e-06, + "loss": 0.1057, + "step": 387 + }, + { + "epoch": 3.4035087719298245, + "grad_norm": 0.32338738441467285, + "learning_rate": 2.553790243590556e-06, + "loss": 0.1015, + "step": 388 + }, + { + "epoch": 3.412280701754386, + "grad_norm": 0.7909435629844666, + "learning_rate": 2.5403440447754385e-06, + "loss": 0.1036, + "step": 389 + }, 
+ { + "epoch": 3.4210526315789473, + "grad_norm": 0.6297115087509155, + "learning_rate": 2.5268966784722792e-06, + "loss": 0.1042, + "step": 390 + }, + { + "epoch": 3.4298245614035086, + "grad_norm": 0.32988762855529785, + "learning_rate": 2.513448533824988e-06, + "loss": 0.1059, + "step": 391 + }, + { + "epoch": 3.43859649122807, + "grad_norm": 0.9211220145225525, + "learning_rate": 2.5e-06, + "loss": 0.1015, + "step": 392 + }, + { + "epoch": 3.4473684210526314, + "grad_norm": 1.2157588005065918, + "learning_rate": 2.486551466175013e-06, + "loss": 0.1035, + "step": 393 + }, + { + "epoch": 3.456140350877193, + "grad_norm": 0.4786648452281952, + "learning_rate": 2.4731033215277216e-06, + "loss": 0.1026, + "step": 394 + }, + { + "epoch": 3.4649122807017543, + "grad_norm": 0.37398242950439453, + "learning_rate": 2.4596559552245623e-06, + "loss": 0.1044, + "step": 395 + }, + { + "epoch": 3.473684210526316, + "grad_norm": 0.5536217093467712, + "learning_rate": 2.446209756409445e-06, + "loss": 0.1043, + "step": 396 + }, + { + "epoch": 3.482456140350877, + "grad_norm": 0.708406925201416, + "learning_rate": 2.432765114192495e-06, + "loss": 0.1046, + "step": 397 + }, + { + "epoch": 3.4912280701754383, + "grad_norm": 0.7140893340110779, + "learning_rate": 2.4193224176387926e-06, + "loss": 0.1039, + "step": 398 + }, + { + "epoch": 3.5, + "grad_norm": 0.8078088760375977, + "learning_rate": 2.4058820557571144e-06, + "loss": 0.1013, + "step": 399 + }, + { + "epoch": 3.5087719298245617, + "grad_norm": 0.7129591107368469, + "learning_rate": 2.3924444174886735e-06, + "loss": 0.1057, + "step": 400 + }, + { + "epoch": 3.517543859649123, + "grad_norm": 1.293412446975708, + "learning_rate": 2.37900989169587e-06, + "loss": 0.1081, + "step": 401 + }, + { + "epoch": 3.526315789473684, + "grad_norm": 0.7235314249992371, + "learning_rate": 2.3655788671510314e-06, + "loss": 0.1054, + "step": 402 + }, + { + "epoch": 3.5350877192982457, + "grad_norm": 0.6008841395378113, + "learning_rate": 
2.3521517325251637e-06, + "loss": 0.1033, + "step": 403 + }, + { + "epoch": 3.543859649122807, + "grad_norm": 0.6819609999656677, + "learning_rate": 2.3387288763767097e-06, + "loss": 0.1019, + "step": 404 + }, + { + "epoch": 3.5526315789473686, + "grad_norm": 0.5696406960487366, + "learning_rate": 2.325310687140296e-06, + "loss": 0.1043, + "step": 405 + }, + { + "epoch": 3.56140350877193, + "grad_norm": 0.8597077131271362, + "learning_rate": 2.3118975531155003e-06, + "loss": 0.1037, + "step": 406 + }, + { + "epoch": 3.5701754385964914, + "grad_norm": 0.43985217809677124, + "learning_rate": 2.2984898624556075e-06, + "loss": 0.105, + "step": 407 + }, + { + "epoch": 3.5789473684210527, + "grad_norm": 0.5448469519615173, + "learning_rate": 2.2850880031563845e-06, + "loss": 0.1037, + "step": 408 + }, + { + "epoch": 3.587719298245614, + "grad_norm": 0.8221977949142456, + "learning_rate": 2.271692363044845e-06, + "loss": 0.1015, + "step": 409 + }, + { + "epoch": 3.5964912280701755, + "grad_norm": 0.9838594198226929, + "learning_rate": 2.2583033297680316e-06, + "loss": 0.1085, + "step": 410 + }, + { + "epoch": 3.6052631578947367, + "grad_norm": 1.034848928451538, + "learning_rate": 2.2449212907817985e-06, + "loss": 0.104, + "step": 411 + }, + { + "epoch": 3.6140350877192984, + "grad_norm": 1.0788371562957764, + "learning_rate": 2.2315466333395927e-06, + "loss": 0.1033, + "step": 412 + }, + { + "epoch": 3.6228070175438596, + "grad_norm": 0.49096915125846863, + "learning_rate": 2.2181797444812557e-06, + "loss": 0.1044, + "step": 413 + }, + { + "epoch": 3.6315789473684212, + "grad_norm": 1.309685230255127, + "learning_rate": 2.204821011021815e-06, + "loss": 0.1036, + "step": 414 + }, + { + "epoch": 3.6403508771929824, + "grad_norm": 0.5014146566390991, + "learning_rate": 2.191470819540301e-06, + "loss": 0.104, + "step": 415 + }, + { + "epoch": 3.6491228070175437, + "grad_norm": 0.770470380783081, + "learning_rate": 2.178129556368548e-06, + "loss": 0.1049, + "step": 416 + }, + 
{ + "epoch": 3.6578947368421053, + "grad_norm": 0.4639376699924469, + "learning_rate": 2.1647976075800235e-06, + "loss": 0.1047, + "step": 417 + }, + { + "epoch": 3.6666666666666665, + "grad_norm": 1.101885437965393, + "learning_rate": 2.151475358978652e-06, + "loss": 0.1035, + "step": 418 + }, + { + "epoch": 3.675438596491228, + "grad_norm": 0.5644329786300659, + "learning_rate": 2.138163196087648e-06, + "loss": 0.103, + "step": 419 + }, + { + "epoch": 3.6842105263157894, + "grad_norm": 1.1015008687973022, + "learning_rate": 2.1248615041383686e-06, + "loss": 0.1054, + "step": 420 + }, + { + "epoch": 3.692982456140351, + "grad_norm": 0.7311366200447083, + "learning_rate": 2.111570668059155e-06, + "loss": 0.1043, + "step": 421 + }, + { + "epoch": 3.7017543859649122, + "grad_norm": 0.38242173194885254, + "learning_rate": 2.098291072464199e-06, + "loss": 0.1041, + "step": 422 + }, + { + "epoch": 3.7105263157894735, + "grad_norm": 1.231512188911438, + "learning_rate": 2.085023101642412e-06, + "loss": 0.1021, + "step": 423 + }, + { + "epoch": 3.719298245614035, + "grad_norm": 0.41761213541030884, + "learning_rate": 2.0717671395463063e-06, + "loss": 0.1062, + "step": 424 + }, + { + "epoch": 3.7280701754385968, + "grad_norm": 0.4593309462070465, + "learning_rate": 2.0585235697808794e-06, + "loss": 0.1012, + "step": 425 + }, + { + "epoch": 3.736842105263158, + "grad_norm": 0.9147135019302368, + "learning_rate": 2.0452927755925155e-06, + "loss": 0.1046, + "step": 426 + }, + { + "epoch": 3.745614035087719, + "grad_norm": 0.39639535546302795, + "learning_rate": 2.0320751398578984e-06, + "loss": 0.1018, + "step": 427 + }, + { + "epoch": 3.754385964912281, + "grad_norm": 0.688010573387146, + "learning_rate": 2.0188710450729255e-06, + "loss": 0.104, + "step": 428 + }, + { + "epoch": 3.763157894736842, + "grad_norm": 0.5140353441238403, + "learning_rate": 2.005680873341644e-06, + "loss": 0.1033, + "step": 429 + }, + { + "epoch": 3.7719298245614032, + "grad_norm": 
0.5970481634140015, + "learning_rate": 1.992505006365191e-06, + "loss": 0.1044, + "step": 430 + }, + { + "epoch": 3.780701754385965, + "grad_norm": 0.551162838935852, + "learning_rate": 1.9793438254307496e-06, + "loss": 0.1042, + "step": 431 + }, + { + "epoch": 3.7894736842105265, + "grad_norm": 0.5344637632369995, + "learning_rate": 1.96619771140051e-06, + "loss": 0.1042, + "step": 432 + }, + { + "epoch": 3.7982456140350878, + "grad_norm": 0.5357667207717896, + "learning_rate": 1.9530670447006566e-06, + "loss": 0.101, + "step": 433 + }, + { + "epoch": 3.807017543859649, + "grad_norm": 1.2536660432815552, + "learning_rate": 1.9399522053103514e-06, + "loss": 0.1008, + "step": 434 + }, + { + "epoch": 3.8157894736842106, + "grad_norm": 0.4888289272785187, + "learning_rate": 1.926853572750741e-06, + "loss": 0.1028, + "step": 435 + }, + { + "epoch": 3.824561403508772, + "grad_norm": 0.5810404419898987, + "learning_rate": 1.913771526073976e-06, + "loss": 0.1031, + "step": 436 + }, + { + "epoch": 3.8333333333333335, + "grad_norm": 0.5372979044914246, + "learning_rate": 1.9007064438522374e-06, + "loss": 0.107, + "step": 437 + }, + { + "epoch": 3.8421052631578947, + "grad_norm": 0.8293616771697998, + "learning_rate": 1.8876587041667855e-06, + "loss": 0.1033, + "step": 438 + }, + { + "epoch": 3.8508771929824563, + "grad_norm": 2.361504554748535, + "learning_rate": 1.8746286845970145e-06, + "loss": 0.1098, + "step": 439 + }, + { + "epoch": 3.8596491228070176, + "grad_norm": 0.70230633020401, + "learning_rate": 1.8616167622095328e-06, + "loss": 0.1034, + "step": 440 + }, + { + "epoch": 3.8684210526315788, + "grad_norm": 0.6323564052581787, + "learning_rate": 1.8486233135472436e-06, + "loss": 0.1058, + "step": 441 + }, + { + "epoch": 3.8771929824561404, + "grad_norm": 0.48205408453941345, + "learning_rate": 1.8356487146184517e-06, + "loss": 0.105, + "step": 442 + }, + { + "epoch": 3.8859649122807016, + "grad_norm": 0.6996872425079346, + "learning_rate": 1.8226933408859864e-06, 
+ "loss": 0.1083, + "step": 443 + }, + { + "epoch": 3.8947368421052633, + "grad_norm": 0.4114651679992676, + "learning_rate": 1.8097575672563278e-06, + "loss": 0.1003, + "step": 444 + }, + { + "epoch": 3.9035087719298245, + "grad_norm": 0.5234648585319519, + "learning_rate": 1.7968417680687666e-06, + "loss": 0.1019, + "step": 445 + }, + { + "epoch": 3.912280701754386, + "grad_norm": 1.0571491718292236, + "learning_rate": 1.7839463170845641e-06, + "loss": 0.1003, + "step": 446 + }, + { + "epoch": 3.9210526315789473, + "grad_norm": 0.7470094561576843, + "learning_rate": 1.7710715874761408e-06, + "loss": 0.1061, + "step": 447 + }, + { + "epoch": 3.9298245614035086, + "grad_norm": 0.901695191860199, + "learning_rate": 1.7582179518162742e-06, + "loss": 0.1015, + "step": 448 + }, + { + "epoch": 3.93859649122807, + "grad_norm": 1.0251179933547974, + "learning_rate": 1.7453857820673215e-06, + "loss": 0.1, + "step": 449 + }, + { + "epoch": 3.9473684210526314, + "grad_norm": 0.5065406560897827, + "learning_rate": 1.7325754495704508e-06, + "loss": 0.1036, + "step": 450 + }, + { + "epoch": 3.956140350877193, + "grad_norm": 0.9541155099868774, + "learning_rate": 1.7197873250348962e-06, + "loss": 0.1015, + "step": 451 + }, + { + "epoch": 3.9649122807017543, + "grad_norm": 0.6264199018478394, + "learning_rate": 1.7070217785272354e-06, + "loss": 0.1026, + "step": 452 + }, + { + "epoch": 3.973684210526316, + "grad_norm": 0.6260526180267334, + "learning_rate": 1.6942791794606716e-06, + "loss": 0.1039, + "step": 453 + }, + { + "epoch": 3.982456140350877, + "grad_norm": 0.4730931222438812, + "learning_rate": 1.681559896584352e-06, + "loss": 0.1045, + "step": 454 + }, + { + "epoch": 3.9912280701754383, + "grad_norm": 0.5011451840400696, + "learning_rate": 1.668864297972689e-06, + "loss": 0.1062, + "step": 455 + }, + { + "epoch": 4.0, + "grad_norm": 1.0113046169281006, + "learning_rate": 1.6561927510147172e-06, + "loss": 0.1005, + "step": 456 + }, + { + "epoch": 4.008771929824562, + 
"grad_norm": 0.6017364263534546, + "learning_rate": 1.6435456224034536e-06, + "loss": 0.1042, + "step": 457 + }, + { + "epoch": 4.017543859649122, + "grad_norm": 0.6874931454658508, + "learning_rate": 1.63092327812529e-06, + "loss": 0.102, + "step": 458 + }, + { + "epoch": 4.026315789473684, + "grad_norm": 1.311024785041809, + "learning_rate": 1.6183260834494053e-06, + "loss": 0.1063, + "step": 459 + }, + { + "epoch": 4.035087719298246, + "grad_norm": 0.3640352785587311, + "learning_rate": 1.6057544029171863e-06, + "loss": 0.1039, + "step": 460 + }, + { + "epoch": 4.043859649122807, + "grad_norm": 0.6056526303291321, + "learning_rate": 1.5932086003316893e-06, + "loss": 0.099, + "step": 461 + }, + { + "epoch": 4.052631578947368, + "grad_norm": 0.5407683849334717, + "learning_rate": 1.5806890387471025e-06, + "loss": 0.1038, + "step": 462 + }, + { + "epoch": 4.06140350877193, + "grad_norm": 0.7054030895233154, + "learning_rate": 1.5681960804582474e-06, + "loss": 0.1001, + "step": 463 + }, + { + "epoch": 4.0701754385964914, + "grad_norm": 0.8736140727996826, + "learning_rate": 1.5557300869900876e-06, + "loss": 0.1035, + "step": 464 + }, + { + "epoch": 4.078947368421052, + "grad_norm": 0.6689419746398926, + "learning_rate": 1.5432914190872757e-06, + "loss": 0.1052, + "step": 465 + }, + { + "epoch": 4.087719298245614, + "grad_norm": 0.8937819600105286, + "learning_rate": 1.530880436703705e-06, + "loss": 0.1024, + "step": 466 + }, + { + "epoch": 4.0964912280701755, + "grad_norm": 0.24332484602928162, + "learning_rate": 1.518497498992097e-06, + "loss": 0.0984, + "step": 467 + }, + { + "epoch": 4.105263157894737, + "grad_norm": 0.9716914296150208, + "learning_rate": 1.5061429642936107e-06, + "loss": 0.1012, + "step": 468 + }, + { + "epoch": 4.114035087719298, + "grad_norm": 0.5864392518997192, + "learning_rate": 1.4938171901274678e-06, + "loss": 0.1029, + "step": 469 + }, + { + "epoch": 4.12280701754386, + "grad_norm": 0.4616212546825409, + "learning_rate": 
1.4815205331806113e-06, + "loss": 0.1035, + "step": 470 + }, + { + "epoch": 4.131578947368421, + "grad_norm": 0.5989730954170227, + "learning_rate": 1.4692533492973775e-06, + "loss": 0.1036, + "step": 471 + }, + { + "epoch": 4.140350877192983, + "grad_norm": 0.7900629639625549, + "learning_rate": 1.4570159934692085e-06, + "loss": 0.1044, + "step": 472 + }, + { + "epoch": 4.149122807017544, + "grad_norm": 0.5659995675086975, + "learning_rate": 1.4448088198243668e-06, + "loss": 0.1024, + "step": 473 + }, + { + "epoch": 4.157894736842105, + "grad_norm": 0.7867873311042786, + "learning_rate": 1.432632181617698e-06, + "loss": 0.1038, + "step": 474 + }, + { + "epoch": 4.166666666666667, + "grad_norm": 0.44385358691215515, + "learning_rate": 1.4204864312204033e-06, + "loss": 0.1006, + "step": 475 + }, + { + "epoch": 4.175438596491228, + "grad_norm": 0.3909265697002411, + "learning_rate": 1.4083719201098404e-06, + "loss": 0.1019, + "step": 476 + }, + { + "epoch": 4.184210526315789, + "grad_norm": 0.7079223990440369, + "learning_rate": 1.3962889988593609e-06, + "loss": 0.1019, + "step": 477 + }, + { + "epoch": 4.192982456140351, + "grad_norm": 0.6703695058822632, + "learning_rate": 1.3842380171281522e-06, + "loss": 0.1063, + "step": 478 + }, + { + "epoch": 4.201754385964913, + "grad_norm": 0.3477051556110382, + "learning_rate": 1.3722193236511344e-06, + "loss": 0.1004, + "step": 479 + }, + { + "epoch": 4.2105263157894735, + "grad_norm": 0.7296048402786255, + "learning_rate": 1.3602332662288536e-06, + "loss": 0.1057, + "step": 480 + }, + { + "epoch": 4.219298245614035, + "grad_norm": 0.7007803916931152, + "learning_rate": 1.348280191717427e-06, + "loss": 0.1007, + "step": 481 + }, + { + "epoch": 4.228070175438597, + "grad_norm": 0.948968231678009, + "learning_rate": 1.3363604460185031e-06, + "loss": 0.1005, + "step": 482 + }, + { + "epoch": 4.2368421052631575, + "grad_norm": 0.6567812561988831, + "learning_rate": 1.3244743740692496e-06, + "loss": 0.1016, + "step": 483 + }, + 
{ + "epoch": 4.245614035087719, + "grad_norm": 0.5390146374702454, + "learning_rate": 1.3126223198323752e-06, + "loss": 0.1025, + "step": 484 + }, + { + "epoch": 4.254385964912281, + "grad_norm": 0.43638724088668823, + "learning_rate": 1.3008046262861696e-06, + "loss": 0.1053, + "step": 485 + }, + { + "epoch": 4.2631578947368425, + "grad_norm": 0.43589839339256287, + "learning_rate": 1.289021635414589e-06, + "loss": 0.1036, + "step": 486 + }, + { + "epoch": 4.271929824561403, + "grad_norm": 0.3999694585800171, + "learning_rate": 1.277273688197346e-06, + "loss": 0.1023, + "step": 487 + }, + { + "epoch": 4.280701754385965, + "grad_norm": 0.6314297914505005, + "learning_rate": 1.265561124600057e-06, + "loss": 0.0993, + "step": 488 + }, + { + "epoch": 4.2894736842105265, + "grad_norm": 0.566033124923706, + "learning_rate": 1.2538842835643906e-06, + "loss": 0.1029, + "step": 489 + }, + { + "epoch": 4.298245614035087, + "grad_norm": 0.6713336110115051, + "learning_rate": 1.2422435029982669e-06, + "loss": 0.1002, + "step": 490 + }, + { + "epoch": 4.307017543859649, + "grad_norm": 0.428574800491333, + "learning_rate": 1.2306391197660797e-06, + "loss": 0.1028, + "step": 491 + }, + { + "epoch": 4.315789473684211, + "grad_norm": 0.637745201587677, + "learning_rate": 1.219071469678941e-06, + "loss": 0.1009, + "step": 492 + }, + { + "epoch": 4.324561403508772, + "grad_norm": 0.8204445242881775, + "learning_rate": 1.2075408874849747e-06, + "loss": 0.099, + "step": 493 + }, + { + "epoch": 4.333333333333333, + "grad_norm": 1.010758876800537, + "learning_rate": 1.1960477068596155e-06, + "loss": 0.1006, + "step": 494 + }, + { + "epoch": 4.342105263157895, + "grad_norm": 0.908112108707428, + "learning_rate": 1.1845922603959677e-06, + "loss": 0.1047, + "step": 495 + }, + { + "epoch": 4.350877192982456, + "grad_norm": 1.0254642963409424, + "learning_rate": 1.173174879595166e-06, + "loss": 0.0991, + "step": 496 + }, + { + "epoch": 4.359649122807017, + "grad_norm": 0.5159414410591125, + 
"learning_rate": 1.1617958948567967e-06, + "loss": 0.0978, + "step": 497 + }, + { + "epoch": 4.368421052631579, + "grad_norm": 0.9525816440582275, + "learning_rate": 1.1504556354693227e-06, + "loss": 0.1051, + "step": 498 + }, + { + "epoch": 4.37719298245614, + "grad_norm": 0.9321548938751221, + "learning_rate": 1.1391544296005652e-06, + "loss": 0.1011, + "step": 499 + }, + { + "epoch": 4.385964912280702, + "grad_norm": 0.7308889627456665, + "learning_rate": 1.1278926042882026e-06, + "loss": 0.1002, + "step": 500 + }, + { + "epoch": 4.394736842105263, + "grad_norm": 0.9508903622627258, + "learning_rate": 1.116670485430304e-06, + "loss": 0.1013, + "step": 501 + }, + { + "epoch": 4.4035087719298245, + "grad_norm": 0.5174031853675842, + "learning_rate": 1.1054883977759067e-06, + "loss": 0.104, + "step": 502 + }, + { + "epoch": 4.412280701754386, + "grad_norm": 0.4504610598087311, + "learning_rate": 1.0943466649156061e-06, + "loss": 0.1013, + "step": 503 + }, + { + "epoch": 4.421052631578947, + "grad_norm": 0.5650261044502258, + "learning_rate": 1.0832456092722063e-06, + "loss": 0.0995, + "step": 504 + }, + { + "epoch": 4.4298245614035086, + "grad_norm": 0.37759432196617126, + "learning_rate": 1.0721855520913751e-06, + "loss": 0.1058, + "step": 505 + }, + { + "epoch": 4.43859649122807, + "grad_norm": 0.7238495349884033, + "learning_rate": 1.0611668134323577e-06, + "loss": 0.1012, + "step": 506 + }, + { + "epoch": 4.447368421052632, + "grad_norm": 0.6301494240760803, + "learning_rate": 1.0501897121587127e-06, + "loss": 0.1009, + "step": 507 + }, + { + "epoch": 4.456140350877193, + "grad_norm": 0.9531002044677734, + "learning_rate": 1.0392545659290789e-06, + "loss": 0.1021, + "step": 508 + }, + { + "epoch": 4.464912280701754, + "grad_norm": 0.4423767924308777, + "learning_rate": 1.0283616911879943e-06, + "loss": 0.1024, + "step": 509 + }, + { + "epoch": 4.473684210526316, + "grad_norm": 0.5573019981384277, + "learning_rate": 1.0175114031567246e-06, + "loss": 0.1011, + 
"step": 510 + }, + { + "epoch": 4.482456140350878, + "grad_norm": 0.9792631268501282, + "learning_rate": 1.0067040158241555e-06, + "loss": 0.1039, + "step": 511 + }, + { + "epoch": 4.491228070175438, + "grad_norm": 1.7911303043365479, + "learning_rate": 9.95939841937693e-07, + "loss": 0.104, + "step": 512 + }, + { + "epoch": 4.5, + "grad_norm": 0.5825617909431458, + "learning_rate": 9.852191929942262e-07, + "loss": 0.0987, + "step": 513 + }, + { + "epoch": 4.508771929824562, + "grad_norm": 0.3129921555519104, + "learning_rate": 9.745423792310996e-07, + "loss": 0.0979, + "step": 514 + }, + { + "epoch": 4.517543859649123, + "grad_norm": 0.5376678705215454, + "learning_rate": 9.63909709617146e-07, + "loss": 0.0998, + "step": 515 + }, + { + "epoch": 4.526315789473684, + "grad_norm": 0.48920008540153503, + "learning_rate": 9.533214918437422e-07, + "loss": 0.1017, + "step": 516 + }, + { + "epoch": 4.535087719298246, + "grad_norm": 0.36829131841659546, + "learning_rate": 9.427780323159006e-07, + "loss": 0.1004, + "step": 517 + }, + { + "epoch": 4.543859649122807, + "grad_norm": 0.5459544658660889, + "learning_rate": 9.322796361434111e-07, + "loss": 0.1041, + "step": 518 + }, + { + "epoch": 4.552631578947368, + "grad_norm": 0.8460657000541687, + "learning_rate": 9.218266071320015e-07, + "loss": 0.1012, + "step": 519 + }, + { + "epoch": 4.56140350877193, + "grad_norm": 0.7692683339118958, + "learning_rate": 9.114192477745568e-07, + "loss": 0.1013, + "step": 520 + }, + { + "epoch": 4.5701754385964914, + "grad_norm": 0.4503592550754547, + "learning_rate": 9.010578592423544e-07, + "loss": 0.107, + "step": 521 + }, + { + "epoch": 4.578947368421053, + "grad_norm": 0.9348855018615723, + "learning_rate": 8.907427413763572e-07, + "loss": 0.102, + "step": 522 + }, + { + "epoch": 4.587719298245614, + "grad_norm": 0.7902988791465759, + "learning_rate": 8.804741926785335e-07, + "loss": 0.1032, + "step": 523 + }, + { + "epoch": 4.5964912280701755, + "grad_norm": 0.5444673299789429, + 
"learning_rate": 8.702525103032186e-07, + "loss": 0.0993, + "step": 524 + }, + { + "epoch": 4.605263157894737, + "grad_norm": 0.728112518787384, + "learning_rate": 8.60077990048517e-07, + "loss": 0.1021, + "step": 525 + }, + { + "epoch": 4.614035087719298, + "grad_norm": 0.5250695943832397, + "learning_rate": 8.499509263477388e-07, + "loss": 0.1018, + "step": 526 + }, + { + "epoch": 4.62280701754386, + "grad_norm": 0.3112829625606537, + "learning_rate": 8.398716122608868e-07, + "loss": 0.1037, + "step": 527 + }, + { + "epoch": 4.631578947368421, + "grad_norm": 0.9097342491149902, + "learning_rate": 8.298403394661658e-07, + "loss": 0.1015, + "step": 528 + }, + { + "epoch": 4.640350877192983, + "grad_norm": 0.6663810014724731, + "learning_rate": 8.198573982515537e-07, + "loss": 0.1038, + "step": 529 + }, + { + "epoch": 4.649122807017544, + "grad_norm": 1.1880309581756592, + "learning_rate": 8.099230775063879e-07, + "loss": 0.1044, + "step": 530 + }, + { + "epoch": 4.657894736842105, + "grad_norm": 0.6492993831634521, + "learning_rate": 8.000376647130165e-07, + "loss": 0.103, + "step": 531 + }, + { + "epoch": 4.666666666666667, + "grad_norm": 0.43723204731941223, + "learning_rate": 7.902014459384744e-07, + "loss": 0.1025, + "step": 532 + }, + { + "epoch": 4.675438596491228, + "grad_norm": 0.8422684669494629, + "learning_rate": 7.804147058262015e-07, + "loss": 0.1035, + "step": 533 + }, + { + "epoch": 4.684210526315789, + "grad_norm": 0.6502094268798828, + "learning_rate": 7.706777275878161e-07, + "loss": 0.0994, + "step": 534 + }, + { + "epoch": 4.692982456140351, + "grad_norm": 0.5709391236305237, + "learning_rate": 7.609907929949045e-07, + "loss": 0.1056, + "step": 535 + }, + { + "epoch": 4.701754385964913, + "grad_norm": 0.4126770496368408, + "learning_rate": 7.513541823708828e-07, + "loss": 0.101, + "step": 536 + }, + { + "epoch": 4.7105263157894735, + "grad_norm": 0.5016621947288513, + "learning_rate": 7.417681745828706e-07, + "loss": 0.0999, + "step": 537 + }, + 
{ + "epoch": 4.719298245614035, + "grad_norm": 0.8139487504959106, + "learning_rate": 7.322330470336314e-07, + "loss": 0.0984, + "step": 538 + }, + { + "epoch": 4.728070175438597, + "grad_norm": 0.5805723667144775, + "learning_rate": 7.227490756535396e-07, + "loss": 0.1011, + "step": 539 + }, + { + "epoch": 4.7368421052631575, + "grad_norm": 0.7970795631408691, + "learning_rate": 7.133165348925978e-07, + "loss": 0.1016, + "step": 540 + }, + { + "epoch": 4.745614035087719, + "grad_norm": 0.6336880326271057, + "learning_rate": 7.039356977124937e-07, + "loss": 0.1027, + "step": 541 + }, + { + "epoch": 4.754385964912281, + "grad_norm": 0.2953254282474518, + "learning_rate": 6.946068355786992e-07, + "loss": 0.1022, + "step": 542 + }, + { + "epoch": 4.7631578947368425, + "grad_norm": 0.5646472573280334, + "learning_rate": 6.853302184526217e-07, + "loss": 0.0998, + "step": 543 + }, + { + "epoch": 4.771929824561403, + "grad_norm": 0.6545483469963074, + "learning_rate": 6.761061147837808e-07, + "loss": 0.0985, + "step": 544 + }, + { + "epoch": 4.780701754385965, + "grad_norm": 0.8741705417633057, + "learning_rate": 6.669347915020524e-07, + "loss": 0.1006, + "step": 545 + }, + { + "epoch": 4.7894736842105265, + "grad_norm": 0.8579487204551697, + "learning_rate": 6.578165140099318e-07, + "loss": 0.1037, + "step": 546 + }, + { + "epoch": 4.798245614035087, + "grad_norm": 1.0744833946228027, + "learning_rate": 6.487515461748631e-07, + "loss": 0.1017, + "step": 547 + }, + { + "epoch": 4.807017543859649, + "grad_norm": 0.4954414367675781, + "learning_rate": 6.397401503215992e-07, + "loss": 0.1006, + "step": 548 + }, + { + "epoch": 4.815789473684211, + "grad_norm": 0.525191068649292, + "learning_rate": 6.307825872246076e-07, + "loss": 0.1024, + "step": 549 + }, + { + "epoch": 4.824561403508772, + "grad_norm": 0.8922368288040161, + "learning_rate": 6.218791161005336e-07, + "loss": 0.0999, + "step": 550 + }, + { + "epoch": 4.833333333333333, + "grad_norm": 0.6471604704856873, + 
"learning_rate": 6.13029994600686e-07, + "loss": 0.0994, + "step": 551 + }, + { + "epoch": 4.842105263157895, + "grad_norm": 0.49826696515083313, + "learning_rate": 6.042354788035943e-07, + "loss": 0.1003, + "step": 552 + }, + { + "epoch": 4.850877192982456, + "grad_norm": 0.7908043265342712, + "learning_rate": 5.954958232075858e-07, + "loss": 0.1003, + "step": 553 + }, + { + "epoch": 4.859649122807017, + "grad_norm": 0.40011560916900635, + "learning_rate": 5.868112807234313e-07, + "loss": 0.0991, + "step": 554 + }, + { + "epoch": 4.868421052631579, + "grad_norm": 0.9797350764274597, + "learning_rate": 5.781821026670203e-07, + "loss": 0.1005, + "step": 555 + }, + { + "epoch": 4.87719298245614, + "grad_norm": 0.4581677317619324, + "learning_rate": 5.696085387520894e-07, + "loss": 0.1013, + "step": 556 + }, + { + "epoch": 4.885964912280702, + "grad_norm": 0.6596454381942749, + "learning_rate": 5.610908370829981e-07, + "loss": 0.1028, + "step": 557 + }, + { + "epoch": 4.894736842105263, + "grad_norm": 0.5106292963027954, + "learning_rate": 5.526292441475448e-07, + "loss": 0.1023, + "step": 558 + }, + { + "epoch": 4.9035087719298245, + "grad_norm": 0.5137461423873901, + "learning_rate": 5.442240048098402e-07, + "loss": 0.1036, + "step": 559 + }, + { + "epoch": 4.912280701754386, + "grad_norm": 0.4619182348251343, + "learning_rate": 5.358753623032137e-07, + "loss": 0.0979, + "step": 560 + }, + { + "epoch": 4.921052631578947, + "grad_norm": 0.5350770354270935, + "learning_rate": 5.275835582231833e-07, + "loss": 0.0992, + "step": 561 + }, + { + "epoch": 4.9298245614035086, + "grad_norm": 0.7599822878837585, + "learning_rate": 5.193488325204551e-07, + "loss": 0.0983, + "step": 562 + }, + { + "epoch": 4.93859649122807, + "grad_norm": 0.47537004947662354, + "learning_rate": 5.111714234939868e-07, + "loss": 0.1004, + "step": 563 + }, + { + "epoch": 4.947368421052632, + "grad_norm": 0.597273588180542, + "learning_rate": 5.030515677840883e-07, + "loss": 0.1015, + "step": 564 + 
}, + { + "epoch": 4.956140350877193, + "grad_norm": 0.7155528664588928, + "learning_rate": 4.949895003655728e-07, + "loss": 0.1017, + "step": 565 + }, + { + "epoch": 4.964912280701754, + "grad_norm": 0.530358612537384, + "learning_rate": 4.869854545409627e-07, + "loss": 0.0998, + "step": 566 + }, + { + "epoch": 4.973684210526316, + "grad_norm": 0.6721721291542053, + "learning_rate": 4.790396619337286e-07, + "loss": 0.1003, + "step": 567 + }, + { + "epoch": 4.982456140350877, + "grad_norm": 0.8486731648445129, + "learning_rate": 4.711523524815978e-07, + "loss": 0.0996, + "step": 568 + }, + { + "epoch": 4.991228070175438, + "grad_norm": 0.7072808742523193, + "learning_rate": 4.633237544298891e-07, + "loss": 0.1004, + "step": 569 + }, + { + "epoch": 5.0, + "grad_norm": 0.41283953189849854, + "learning_rate": 4.555540943249187e-07, + "loss": 0.1026, + "step": 570 + }, + { + "epoch": 5.008771929824562, + "grad_norm": 0.7376545667648315, + "learning_rate": 4.478435970074341e-07, + "loss": 0.1001, + "step": 571 + }, + { + "epoch": 5.017543859649122, + "grad_norm": 0.42418381571769714, + "learning_rate": 4.401924856061146e-07, + "loss": 0.0998, + "step": 572 + }, + { + "epoch": 5.026315789473684, + "grad_norm": 0.5682939291000366, + "learning_rate": 4.326009815311125e-07, + "loss": 0.1015, + "step": 573 + }, + { + "epoch": 5.035087719298246, + "grad_norm": 0.6277433633804321, + "learning_rate": 4.250693044676429e-07, + "loss": 0.1067, + "step": 574 + }, + { + "epoch": 5.043859649122807, + "grad_norm": 0.8414298892021179, + "learning_rate": 4.175976723696337e-07, + "loss": 0.1007, + "step": 575 + }, + { + "epoch": 5.052631578947368, + "grad_norm": 0.48310723900794983, + "learning_rate": 4.1018630145340744e-07, + "loss": 0.0966, + "step": 576 + }, + { + "epoch": 5.06140350877193, + "grad_norm": 0.7204103469848633, + "learning_rate": 4.028354061914369e-07, + "loss": 0.1001, + "step": 577 + }, + { + "epoch": 5.0701754385964914, + "grad_norm": 0.4454537630081177, + 
"learning_rate": 3.9554519930612683e-07, + "loss": 0.0975, + "step": 578 + }, + { + "epoch": 5.078947368421052, + "grad_norm": 0.71866774559021, + "learning_rate": 3.88315891763667e-07, + "loss": 0.0995, + "step": 579 + }, + { + "epoch": 5.087719298245614, + "grad_norm": 0.5037544369697571, + "learning_rate": 3.811476927679228e-07, + "loss": 0.1003, + "step": 580 + }, + { + "epoch": 5.0964912280701755, + "grad_norm": 0.4898604154586792, + "learning_rate": 3.7404080975438073e-07, + "loss": 0.1006, + "step": 581 + }, + { + "epoch": 5.105263157894737, + "grad_norm": 0.5109504461288452, + "learning_rate": 3.6699544838415035e-07, + "loss": 0.0975, + "step": 582 + }, + { + "epoch": 5.114035087719298, + "grad_norm": 0.5904539227485657, + "learning_rate": 3.600118125380056e-07, + "loss": 0.1027, + "step": 583 + }, + { + "epoch": 5.12280701754386, + "grad_norm": 0.7211642265319824, + "learning_rate": 3.5309010431049284e-07, + "loss": 0.1025, + "step": 584 + }, + { + "epoch": 5.131578947368421, + "grad_norm": 0.6350153088569641, + "learning_rate": 3.462305240040739e-07, + "loss": 0.1003, + "step": 585 + }, + { + "epoch": 5.140350877192983, + "grad_norm": 0.4940623641014099, + "learning_rate": 3.394332701233391e-07, + "loss": 0.1009, + "step": 586 + }, + { + "epoch": 5.149122807017544, + "grad_norm": 0.6850067973136902, + "learning_rate": 3.326985393692539e-07, + "loss": 0.0976, + "step": 587 + }, + { + "epoch": 5.157894736842105, + "grad_norm": 0.5988023281097412, + "learning_rate": 3.260265266334725e-07, + "loss": 0.1002, + "step": 588 + }, + { + "epoch": 5.166666666666667, + "grad_norm": 0.4078713059425354, + "learning_rate": 3.1941742499269764e-07, + "loss": 0.1002, + "step": 589 + }, + { + "epoch": 5.175438596491228, + "grad_norm": 0.9976629614830017, + "learning_rate": 3.128714257030882e-07, + "loss": 0.0982, + "step": 590 + }, + { + "epoch": 5.184210526315789, + "grad_norm": 0.7467443346977234, + "learning_rate": 3.063887181947334e-07, + "loss": 0.0978, + "step": 591 + 
}, + { + "epoch": 5.192982456140351, + "grad_norm": 0.46989375352859497, + "learning_rate": 2.9996949006616096e-07, + "loss": 0.099, + "step": 592 + }, + { + "epoch": 5.201754385964913, + "grad_norm": 0.6407843828201294, + "learning_rate": 2.9361392707891763e-07, + "loss": 0.1009, + "step": 593 + }, + { + "epoch": 5.2105263157894735, + "grad_norm": 0.5148762464523315, + "learning_rate": 2.8732221315218576e-07, + "loss": 0.1048, + "step": 594 + }, + { + "epoch": 5.219298245614035, + "grad_norm": 1.0204253196716309, + "learning_rate": 2.810945303574664e-07, + "loss": 0.1032, + "step": 595 + }, + { + "epoch": 5.228070175438597, + "grad_norm": 0.5452238321304321, + "learning_rate": 2.7493105891330837e-07, + "loss": 0.0987, + "step": 596 + }, + { + "epoch": 5.2368421052631575, + "grad_norm": 0.561916708946228, + "learning_rate": 2.688319771800929e-07, + "loss": 0.0972, + "step": 597 + }, + { + "epoch": 5.245614035087719, + "grad_norm": 0.4652751684188843, + "learning_rate": 2.6279746165487256e-07, + "loss": 0.0991, + "step": 598 + }, + { + "epoch": 5.254385964912281, + "grad_norm": 0.8166212439537048, + "learning_rate": 2.568276869662628e-07, + "loss": 0.0998, + "step": 599 + }, + { + "epoch": 5.2631578947368425, + "grad_norm": 0.5090087056159973, + "learning_rate": 2.5092282586939187e-07, + "loss": 0.1011, + "step": 600 + }, + { + "epoch": 5.271929824561403, + "grad_norm": 0.8435099720954895, + "learning_rate": 2.450830492408954e-07, + "loss": 0.1016, + "step": 601 + }, + { + "epoch": 5.280701754385965, + "grad_norm": 0.8541790843009949, + "learning_rate": 2.393085260739794e-07, + "loss": 0.1034, + "step": 602 + }, + { + "epoch": 5.2894736842105265, + "grad_norm": 0.7966872453689575, + "learning_rate": 2.3359942347352172e-07, + "loss": 0.0996, + "step": 603 + }, + { + "epoch": 5.298245614035087, + "grad_norm": 0.6361204981803894, + "learning_rate": 2.2795590665124267e-07, + "loss": 0.1007, + "step": 604 + }, + { + "epoch": 5.307017543859649, + "grad_norm": 
0.4418005049228668, + "learning_rate": 2.2237813892092175e-07, + "loss": 0.0983, + "step": 605 + }, + { + "epoch": 5.315789473684211, + "grad_norm": 0.3627215623855591, + "learning_rate": 2.1686628169366923e-07, + "loss": 0.1016, + "step": 606 + }, + { + "epoch": 5.324561403508772, + "grad_norm": 0.6289935111999512, + "learning_rate": 2.114204944732609e-07, + "loss": 0.1024, + "step": 607 + }, + { + "epoch": 5.333333333333333, + "grad_norm": 0.5116890668869019, + "learning_rate": 2.0604093485151548e-07, + "loss": 0.1005, + "step": 608 + }, + { + "epoch": 5.342105263157895, + "grad_norm": 0.344194620847702, + "learning_rate": 2.007277585037412e-07, + "loss": 0.1007, + "step": 609 + }, + { + "epoch": 5.350877192982456, + "grad_norm": 1.0403063297271729, + "learning_rate": 1.95481119184224e-07, + "loss": 0.1006, + "step": 610 + }, + { + "epoch": 5.359649122807017, + "grad_norm": 0.32791537046432495, + "learning_rate": 1.9030116872178317e-07, + "loss": 0.1007, + "step": 611 + }, + { + "epoch": 5.368421052631579, + "grad_norm": 0.6505579948425293, + "learning_rate": 1.851880570153755e-07, + "loss": 0.1049, + "step": 612 + }, + { + "epoch": 5.37719298245614, + "grad_norm": 0.726384162902832, + "learning_rate": 1.801419320297576e-07, + "loss": 0.1023, + "step": 613 + }, + { + "epoch": 5.385964912280702, + "grad_norm": 1.0476131439208984, + "learning_rate": 1.7516293979120525e-07, + "loss": 0.0984, + "step": 614 + }, + { + "epoch": 5.394736842105263, + "grad_norm": 0.8576235771179199, + "learning_rate": 1.7025122438328434e-07, + "loss": 0.1006, + "step": 615 + }, + { + "epoch": 5.4035087719298245, + "grad_norm": 0.2774132788181305, + "learning_rate": 1.654069279426873e-07, + "loss": 0.0986, + "step": 616 + }, + { + "epoch": 5.412280701754386, + "grad_norm": 0.43602442741394043, + "learning_rate": 1.6063019065511276e-07, + "loss": 0.0992, + "step": 617 + }, + { + "epoch": 5.421052631578947, + "grad_norm": 0.6421550512313843, + "learning_rate": 1.5592115075121512e-07, + 
"loss": 0.1017, + "step": 618 + }, + { + "epoch": 5.4298245614035086, + "grad_norm": 0.7895707488059998, + "learning_rate": 1.5127994450259976e-07, + "loss": 0.097, + "step": 619 + }, + { + "epoch": 5.43859649122807, + "grad_norm": 0.5679956674575806, + "learning_rate": 1.467067062178823e-07, + "loss": 0.0996, + "step": 620 + }, + { + "epoch": 5.447368421052632, + "grad_norm": 0.48801174759864807, + "learning_rate": 1.4220156823880144e-07, + "loss": 0.1034, + "step": 621 + }, + { + "epoch": 5.456140350877193, + "grad_norm": 0.4325696527957916, + "learning_rate": 1.3776466093638696e-07, + "loss": 0.0979, + "step": 622 + }, + { + "epoch": 5.464912280701754, + "grad_norm": 0.38854703307151794, + "learning_rate": 1.3339611270719198e-07, + "loss": 0.0998, + "step": 623 + }, + { + "epoch": 5.473684210526316, + "grad_norm": 0.698753833770752, + "learning_rate": 1.2909604996957093e-07, + "loss": 0.1008, + "step": 624 + }, + { + "epoch": 5.482456140350878, + "grad_norm": 0.6861230134963989, + "learning_rate": 1.2486459716002792e-07, + "loss": 0.1029, + "step": 625 + }, + { + "epoch": 5.491228070175438, + "grad_norm": 0.564124345779419, + "learning_rate": 1.2070187672960948e-07, + "loss": 0.1036, + "step": 626 + }, + { + "epoch": 5.5, + "grad_norm": 0.47016748785972595, + "learning_rate": 1.1660800914036568e-07, + "loss": 0.0999, + "step": 627 + }, + { + "epoch": 5.508771929824562, + "grad_norm": 0.6495513319969177, + "learning_rate": 1.1258311286186208e-07, + "loss": 0.0995, + "step": 628 + }, + { + "epoch": 5.517543859649123, + "grad_norm": 0.312717080116272, + "learning_rate": 1.086273043677516e-07, + "loss": 0.098, + "step": 629 + }, + { + "epoch": 5.526315789473684, + "grad_norm": 0.6478825211524963, + "learning_rate": 1.0474069813240505e-07, + "loss": 0.098, + "step": 630 + }, + { + "epoch": 5.535087719298246, + "grad_norm": 0.7767362594604492, + "learning_rate": 1.0092340662759548e-07, + "loss": 0.1022, + "step": 631 + }, + { + "epoch": 5.543859649122807, + 
"grad_norm": 0.5980598330497742, + "learning_rate": 9.717554031924842e-08, + "loss": 0.0977, + "step": 632 + }, + { + "epoch": 5.552631578947368, + "grad_norm": 0.7471850514411926, + "learning_rate": 9.349720766423931e-08, + "loss": 0.0991, + "step": 633 + }, + { + "epoch": 5.56140350877193, + "grad_norm": 0.48221901059150696, + "learning_rate": 8.988851510726093e-08, + "loss": 0.0985, + "step": 634 + }, + { + "epoch": 5.5701754385964914, + "grad_norm": 0.8782841563224792, + "learning_rate": 8.634956707773729e-08, + "loss": 0.1025, + "step": 635 + }, + { + "epoch": 5.578947368421053, + "grad_norm": 0.35953524708747864, + "learning_rate": 8.288046598680627e-08, + "loss": 0.1016, + "step": 636 + }, + { + "epoch": 5.587719298245614, + "grad_norm": 0.3914284408092499, + "learning_rate": 7.948131222435346e-08, + "loss": 0.096, + "step": 637 + }, + { + "epoch": 5.5964912280701755, + "grad_norm": 0.5373840928077698, + "learning_rate": 7.61522041561069e-08, + "loss": 0.1005, + "step": 638 + }, + { + "epoch": 5.605263157894737, + "grad_norm": 0.6877533197402954, + "learning_rate": 7.289323812079363e-08, + "loss": 0.0974, + "step": 639 + }, + { + "epoch": 5.614035087719298, + "grad_norm": 0.6217812299728394, + "learning_rate": 6.97045084273465e-08, + "loss": 0.0988, + "step": 640 + }, + { + "epoch": 5.62280701754386, + "grad_norm": 0.5998544692993164, + "learning_rate": 6.658610735218147e-08, + "loss": 0.101, + "step": 641 + }, + { + "epoch": 5.631578947368421, + "grad_norm": 0.44546636939048767, + "learning_rate": 6.353812513652052e-08, + "loss": 0.0993, + "step": 642 + }, + { + "epoch": 5.640350877192983, + "grad_norm": 0.5359933972358704, + "learning_rate": 6.056064998378658e-08, + "loss": 0.1039, + "step": 643 + }, + { + "epoch": 5.649122807017544, + "grad_norm": 0.45402801036834717, + "learning_rate": 5.7653768057045757e-08, + "loss": 0.1008, + "step": 644 + }, + { + "epoch": 5.657894736842105, + "grad_norm": 0.6362654566764832, + "learning_rate": 5.481756347651773e-08, 
+ "loss": 0.0968, + "step": 645 + }, + { + "epoch": 5.666666666666667, + "grad_norm": 0.3837541937828064, + "learning_rate": 5.205211831713935e-08, + "loss": 0.1001, + "step": 646 + }, + { + "epoch": 5.675438596491228, + "grad_norm": 0.4877745807170868, + "learning_rate": 4.935751260618987e-08, + "loss": 0.1021, + "step": 647 + }, + { + "epoch": 5.684210526315789, + "grad_norm": 0.5268471837043762, + "learning_rate": 4.6733824320976674e-08, + "loss": 0.1016, + "step": 648 + }, + { + "epoch": 5.692982456140351, + "grad_norm": 0.5390419363975525, + "learning_rate": 4.418112938657571e-08, + "loss": 0.1016, + "step": 649 + }, + { + "epoch": 5.701754385964913, + "grad_norm": 0.6146634221076965, + "learning_rate": 4.169950167363768e-08, + "loss": 0.0948, + "step": 650 + }, + { + "epoch": 5.7105263157894735, + "grad_norm": 0.5784945487976074, + "learning_rate": 3.928901299624782e-08, + "loss": 0.1007, + "step": 651 + }, + { + "epoch": 5.719298245614035, + "grad_norm": 0.8223549723625183, + "learning_rate": 3.6949733109848395e-08, + "loss": 0.1011, + "step": 652 + }, + { + "epoch": 5.728070175438597, + "grad_norm": 0.9502666592597961, + "learning_rate": 3.468172970922168e-08, + "loss": 0.102, + "step": 653 + }, + { + "epoch": 5.7368421052631575, + "grad_norm": 0.5113492608070374, + "learning_rate": 3.248506842652793e-08, + "loss": 0.101, + "step": 654 + }, + { + "epoch": 5.745614035087719, + "grad_norm": 1.0006201267242432, + "learning_rate": 3.0359812829409694e-08, + "loss": 0.0987, + "step": 655 + }, + { + "epoch": 5.754385964912281, + "grad_norm": 0.6877694129943848, + "learning_rate": 2.8306024419148814e-08, + "loss": 0.1003, + "step": 656 + }, + { + "epoch": 5.7631578947368425, + "grad_norm": 0.4734198749065399, + "learning_rate": 2.6323762628889804e-08, + "loss": 0.0975, + "step": 657 + }, + { + "epoch": 5.771929824561403, + "grad_norm": 0.8467719554901123, + "learning_rate": 2.4413084821916232e-08, + "loss": 0.0978, + "step": 658 + }, + { + "epoch": 
5.780701754385965, + "grad_norm": 0.47460225224494934, + "learning_rate": 2.2574046289995933e-08, + "loss": 0.1001, + "step": 659 + }, + { + "epoch": 5.7894736842105265, + "grad_norm": 0.37792477011680603, + "learning_rate": 2.0806700251775057e-08, + "loss": 0.1002, + "step": 660 + }, + { + "epoch": 5.798245614035087, + "grad_norm": 0.7944504618644714, + "learning_rate": 1.9111097851242654e-08, + "loss": 0.0997, + "step": 661 + }, + { + "epoch": 5.807017543859649, + "grad_norm": 0.3530051112174988, + "learning_rate": 1.7487288156248782e-08, + "loss": 0.1021, + "step": 662 + }, + { + "epoch": 5.815789473684211, + "grad_norm": 0.6301564574241638, + "learning_rate": 1.593531815708371e-08, + "loss": 0.1023, + "step": 663 + }, + { + "epoch": 5.824561403508772, + "grad_norm": 0.5501565337181091, + "learning_rate": 1.4455232765120397e-08, + "loss": 0.1014, + "step": 664 + }, + { + "epoch": 5.833333333333333, + "grad_norm": 0.5270814299583435, + "learning_rate": 1.3047074811512184e-08, + "loss": 0.099, + "step": 665 + }, + { + "epoch": 5.842105263157895, + "grad_norm": 0.6463411450386047, + "learning_rate": 1.1710885045956022e-08, + "loss": 0.0987, + "step": 666 + }, + { + "epoch": 5.850877192982456, + "grad_norm": 1.0232126712799072, + "learning_rate": 1.0446702135511188e-08, + "loss": 0.1017, + "step": 667 + }, + { + "epoch": 5.859649122807017, + "grad_norm": 0.3154284954071045, + "learning_rate": 9.25456266348046e-09, + "loss": 0.0921, + "step": 668 + }, + { + "epoch": 5.868421052631579, + "grad_norm": 0.7173347473144531, + "learning_rate": 8.134501128353456e-09, + "loss": 0.1007, + "step": 669 + }, + { + "epoch": 5.87719298245614, + "grad_norm": 0.6975192427635193, + "learning_rate": 7.086549942805499e-09, + "loss": 0.1031, + "step": 670 + }, + { + "epoch": 5.885964912280702, + "grad_norm": 0.7983221411705017, + "learning_rate": 6.110739432762247e-09, + "loss": 0.0991, + "step": 671 + }, + { + "epoch": 5.894736842105263, + "grad_norm": 0.8028814196586609, + 
"learning_rate": 5.20709783651957e-09, + "loss": 0.0942, + "step": 672 + }, + { + "epoch": 5.9035087719298245, + "grad_norm": 0.7531240582466125, + "learning_rate": 4.375651303928918e-09, + "loss": 0.1025, + "step": 673 + }, + { + "epoch": 5.912280701754386, + "grad_norm": 0.5777604579925537, + "learning_rate": 3.6164238956384878e-09, + "loss": 0.1, + "step": 674 + }, + { + "epoch": 5.921052631578947, + "grad_norm": 0.7759271264076233, + "learning_rate": 2.929437582398775e-09, + "loss": 0.0992, + "step": 675 + }, + { + "epoch": 5.9298245614035086, + "grad_norm": 0.38214001059532166, + "learning_rate": 2.3147122444250327e-09, + "loss": 0.1021, + "step": 676 + }, + { + "epoch": 5.93859649122807, + "grad_norm": 0.7428710460662842, + "learning_rate": 1.7722656708230034e-09, + "loss": 0.0989, + "step": 677 + }, + { + "epoch": 5.947368421052632, + "grad_norm": 0.6303841471672058, + "learning_rate": 1.3021135590740585e-09, + "loss": 0.0993, + "step": 678 + }, + { + "epoch": 5.956140350877193, + "grad_norm": 0.8671356439590454, + "learning_rate": 9.04269514580558e-10, + "loss": 0.1026, + "step": 679 + }, + { + "epoch": 5.964912280701754, + "grad_norm": 0.4996141195297241, + "learning_rate": 5.787450502728331e-10, + "loss": 0.1033, + "step": 680 + }, + { + "epoch": 5.973684210526316, + "grad_norm": 0.625603199005127, + "learning_rate": 3.255495862750091e-10, + "loss": 0.1038, + "step": 681 + }, + { + "epoch": 5.982456140350877, + "grad_norm": 0.674436628818512, + "learning_rate": 1.446904496335555e-10, + "loss": 0.0969, + "step": 682 + }, + { + "epoch": 5.991228070175438, + "grad_norm": 0.778946578502655, + "learning_rate": 3.6172874103845845e-11, + "loss": 0.1004, + "step": 683 + }, + { + "epoch": 6.0, + "grad_norm": 0.5384402275085449, + "learning_rate": 0.0, + "loss": 0.0971, + "step": 684 + } + ], + "logging_steps": 1, + "max_steps": 684, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 114, + "stateful_callbacks": { + "TrainerControl": { + "args": 
{ + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.075994044489112e+19, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-684/training_args.bin b/checkpoint-684/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..38c27bdabb0e0e68242bce9d9302628a34f6e7cf --- /dev/null +++ b/checkpoint-684/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7cb0553c2c3dd5a010aed55eae3afd8bd7f096b43ba03d25af54dc26191426ae +size 7992 diff --git a/checkpoint-684/zero_to_fp32.py b/checkpoint-684/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/checkpoint-684/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in 
state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to 
the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel 
{unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + 
print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + 
partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the 
partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + 
state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. 
you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. 
Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/config.json b/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b7324bbcd3035a34c2ac96f0e2a46dd94a5db25c --- /dev/null +++ b/config.json @@ -0,0 +1,1497 @@ +{ + "_attn_implementation_autoset": true, + "_name_or_path": "nvidia/Llama-3_3-Nemotron-Super-49B-v1", + "architectures": [ + "DeciLMForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "nvidia/Llama-3_3-Nemotron-Super-49B-v1--configuration_decilm.DeciLMConfig", + "AutoModelForCausalLM": "nvidia/Llama-3_3-Nemotron-Super-49B-v1--modeling_decilm.DeciLMForCausalLM" + }, + "block_configs": [ + { + "attention": { + "n_heads_in_group": 8, + "no_op": false, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 2.625, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": 8, + "no_op": false, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 5.25, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": 8, + "no_op": false, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": 
false, + "window_length": null + }, + "ffn": { + "ffn_mult": 5.25, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": 8, + "no_op": false, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 5.25, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": 8, + "no_op": false, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 5.25, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": 8, + "no_op": false, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 5.25, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": null, + "no_op": true, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 2.625, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": null, + "no_op": true, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 2.625, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": 8, + "no_op": false, 
+ "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 5.25, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": 8, + "no_op": false, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 5.25, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": 8, + "no_op": false, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 5.25, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": null, + "no_op": true, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 3.28125, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": 8, + "no_op": false, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 5.25, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": 8, + "no_op": false, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 
5.25, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": 8, + "no_op": false, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 5.25, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": 8, + "no_op": false, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 5.25, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": 8, + "no_op": false, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 5.25, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": 8, + "no_op": false, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 5.25, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": 8, + "no_op": false, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 5.25, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": 8, + "no_op": false, + "num_sink_tokens": null, + "replace_with_linear": false, + 
"sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 5.25, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": 8, + "no_op": false, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 5.25, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": 8, + "no_op": false, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 5.25, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": 8, + "no_op": false, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 5.25, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": 8, + "no_op": false, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 5.25, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": 8, + "no_op": false, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 5.25, + "no_op": false, + "replace_with_linear": false, + 
"sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": 8, + "no_op": false, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 5.25, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": 8, + "no_op": false, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 5.25, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": 8, + "no_op": false, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 5.25, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": 8, + "no_op": false, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 5.25, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": 8, + "no_op": false, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 5.25, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": 8, + "no_op": false, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + 
"use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 5.25, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": 8, + "no_op": false, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 5.25, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": 8, + "no_op": false, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 5.25, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": 8, + "no_op": false, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 5.25, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": 8, + "no_op": false, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 5.25, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": 8, + "no_op": false, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 5.25, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + 
"n_heads_in_group": 8, + "no_op": false, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 5.25, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": 8, + "no_op": false, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 5.25, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": 8, + "no_op": false, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 5.25, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": 8, + "no_op": false, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 5.25, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": 8, + "no_op": false, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 5.25, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": 8, + "no_op": false, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": 
null + }, + "ffn": { + "ffn_mult": 5.25, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": null, + "no_op": true, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 1.3125, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": null, + "no_op": true, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 2.625, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": null, + "no_op": true, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 2.625, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": null, + "no_op": true, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 1.3125, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": null, + "no_op": true, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 5.25, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": null, + "no_op": true, + 
"num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 1.3125, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": null, + "no_op": true, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 2.625, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": null, + "no_op": true, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 1.3125, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": null, + "no_op": true, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 1.3125, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": null, + "no_op": true, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 1.3125, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": 8, + "no_op": false, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + 
"ffn_mult": 5.25, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": null, + "no_op": true, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 1.3125, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": null, + "no_op": true, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 1.0, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": null, + "no_op": true, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 1.0, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": null, + "no_op": true, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 1.3125, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": null, + "no_op": true, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 1.0, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": null, + "no_op": true, + "num_sink_tokens": null, + 
"replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 1.0, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": null, + "no_op": true, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 1.0, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": null, + "no_op": true, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 1.3125, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": null, + "no_op": true, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 1.3125, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": null, + "no_op": true, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 0.5, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": null, + "no_op": true, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 0.5, + "no_op": false, 
+ "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": null, + "no_op": true, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 1.0, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": null, + "no_op": true, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 1.0, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": null, + "no_op": true, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 0.5, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": null, + "no_op": true, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 0.5, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": null, + "no_op": true, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 1.0, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": null, + "no_op": true, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + 
"unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 0.5, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": null, + "no_op": true, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 0.5, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": 8, + "no_op": false, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 5.25, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": 8, + "no_op": false, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 5.25, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": 8, + "no_op": false, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 5.25, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": 8, + "no_op": false, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 5.25, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + 
{ + "attention": { + "n_heads_in_group": 8, + "no_op": false, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 5.25, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": 8, + "no_op": false, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 5.25, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": 8, + "no_op": false, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 5.25, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": 8, + "no_op": false, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 5.25, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + }, + { + "attention": { + "n_heads_in_group": 8, + "no_op": false, + "num_sink_tokens": null, + "replace_with_linear": false, + "sparsify": null, + "unshifted_sink": false, + "use_prefill_window_in_sink_attention": false, + "window_length": null + }, + "ffn": { + "ffn_mult": 5.25, + "no_op": false, + "replace_with_linear": false, + "sparsify": null + } + } + ], + "bos_token_id": 128000, + "eos_token_id": 128009, + "hidden_act": "silu", + "hidden_size": 8192, + "initializer_range": 0.02, + "intermediate_size": null, + "max_position_embeddings": 131072, + "mlp_bias": 
false, + "model_type": "nemotron-nas", + "num_attention_heads": 64, + "num_hidden_layers": 80, + "num_key_value_heads": null, + "pretraining_tp": 1, + "quantization_config": { + "_load_in_4bit": true, + "_load_in_8bit": false, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_storage": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "llm_int8_enable_fp32_cpu_offload": false, + "llm_int8_has_fp16_weight": false, + "llm_int8_skip_modules": null, + "llm_int8_threshold": 6.0, + "load_in_4bit": true, + "load_in_8bit": false, + "quant_method": "bitsandbytes" + }, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.49.0", + "use_cache": false, + "vocab_size": 128256 +} diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..278b7f0f84be865c4687700ee7b3c63d89a51e18 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/tokenizer_config.json 
b/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..edd01b980c1db496ea102a51c972ee8f5d1a2c74 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,2064 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": 
"<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + 
"content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, 
+ "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": 
"<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, 
+ "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": 
false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": 
false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": 
"<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": 
"<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}{%- if messages[0]['role'] == 'system' %}{%- set system_message = messages[0]['content']|trim %}{%- set messages = messages[1:] %}{%- else %}{%- set system_message = \"\" %}{%- endif %}{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}{{- system_message }}{{- \"<|eot_id|>\" }}{%- for message in messages %}{%- if message['role'] == 'assistant' and '' in message['content'] %}{%- set content = message['content'].split('')[-1].lstrip() %}{%- else %}{%- set content = message['content'] %}{%- endif %}{{- 
'<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n' + content | trim + '<|eot_id|>' }}{%- endfor %}{%- if add_generation_prompt %}{{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}{%- endif %}", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|end_of_text|>", + "tokenizer_class": "PreTrainedTokenizer" +}