diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..721f2d0c53abfbaf4d56cbb67c73669aa495bc3c 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +checkpoint-152/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-228/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-304/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-380/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-76/tokenizer.json filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/adapter_config.json b/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..997faa1b89c230966ec24841fa4d3fbf3dcaedd6 --- /dev/null +++ b/adapter_config.json @@ -0,0 +1,42 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": null, + "inference_mode": false, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 512, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [ + "embed_tokens", + "lm_head" + ], + "peft_type": "LORA", + "r": 256, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "v_proj", + "k_proj", + "down_proj", + "q_proj", + "up_proj", + "gate_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-152/README.md b/checkpoint-152/README.md new file mode 
100644 index 0000000000000000000000000000000000000000..fdf619c317c2fe82074662582dbd68166b6f9d50 --- /dev/null +++ b/checkpoint-152/README.md @@ -0,0 +1,202 @@ +--- +base_model: nvidia/Llama-3.1-Nemotron-70B-Instruct-HF +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.0 \ No newline at end of file diff --git a/checkpoint-152/adapter_config.json b/checkpoint-152/adapter_config.json new file mode 100644 index 
0000000000000000000000000000000000000000..a192388a7b55129be9ad9168abc396b47bbda6f7 --- /dev/null +++ b/checkpoint-152/adapter_config.json @@ -0,0 +1,42 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 512, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [ + "embed_tokens", + "lm_head" + ], + "peft_type": "LORA", + "r": 256, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "q_proj", + "o_proj", + "down_proj", + "gate_proj", + "k_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-152/adapter_model.safetensors b/checkpoint-152/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..37bf07bb5ac06c1eff8cfca5b99d2e928094699f --- /dev/null +++ b/checkpoint-152/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb0c61458020c30affe57a79c4c7d5f769fe8b936ffb7021ed5dab0c45b17a70 +size 10829849744 diff --git a/checkpoint-152/global_step152/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-152/global_step152/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f9c833582c7cff5c7c9faa52143f7512a66f8b22 --- /dev/null +++ b/checkpoint-152/global_step152/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:2fe8afff7aaba765b6240a521193dbf428c406b06475e9129624dc50d67a5a52 +size 21659418140 diff --git a/checkpoint-152/global_step152/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-152/global_step152/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cb62cd8861cc2766c3902b9071bca96f19b818bc --- /dev/null +++ b/checkpoint-152/global_step152/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:193cd1f2b2e47fd145128f3307fbe4114ddd4cfa36e3dd861e8291f98806a3df +size 21659457372 diff --git a/checkpoint-152/global_step152/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-152/global_step152/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5d23c6471b732d4fe19254dda9e47c9990ee0693 --- /dev/null +++ b/checkpoint-152/global_step152/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b068efe3c1a280067ef6499a0a9f9e4d687740eaa33639912116d7b8738b9ea +size 21659417820 diff --git a/checkpoint-152/global_step152/mp_rank_00_model_states.pt b/checkpoint-152/global_step152/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a7f86072d7cd8f79e7ab04fc7592779bbf75eb27 --- /dev/null +++ b/checkpoint-152/global_step152/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b08c67c92634fcdf05035a39e0631e5a64899934bd4781c74b05282f0571716 +size 11918643933 diff --git a/checkpoint-152/latest b/checkpoint-152/latest new file mode 100644 index 0000000000000000000000000000000000000000..60406aecd15beeaa730a071c614fe2ab5b4c734b --- /dev/null +++ b/checkpoint-152/latest @@ -0,0 +1 @@ +global_step152 \ No newline at end of file diff --git a/checkpoint-152/rng_state_0.pth b/checkpoint-152/rng_state_0.pth new file mode 100644 index 
0000000000000000000000000000000000000000..1273774ecb9e0ba6283be0c2e8531a122e231d68 --- /dev/null +++ b/checkpoint-152/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd406425ef68395a3dcb05f97990b4ddc0a85ccc26e2550b978b0f0905f63fca +size 14768 diff --git a/checkpoint-152/rng_state_1.pth b/checkpoint-152/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..7e626e64f550afe32c3368c8e040cd7056a74bb6 --- /dev/null +++ b/checkpoint-152/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:483a6993ec4e0cbec89e300d2a3bbeaf7fff23e01afc2457568a12aad958f9ac +size 14768 diff --git a/checkpoint-152/rng_state_2.pth b/checkpoint-152/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..1019e5921f219a14d8e09734eb68025ace867a77 --- /dev/null +++ b/checkpoint-152/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38f8203e92e896c8414d617110a8f97cc8e8be34d1aec495713321cbbe176d78 +size 14768 diff --git a/checkpoint-152/scheduler.pt b/checkpoint-152/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..92432b11da80f35807484df38118e1ccd8d23aaa --- /dev/null +++ b/checkpoint-152/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6fa27436a75b1cdfdf2a48cad061d7983f71c2e5ca468127002dad296770375e +size 1064 diff --git a/checkpoint-152/special_tokens_map.json b/checkpoint-152/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..278b7f0f84be865c4687700ee7b3c63d89a51e18 --- /dev/null +++ b/checkpoint-152/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + 
"pad_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-152/tokenizer.json b/checkpoint-152/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-152/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-152/tokenizer_config.json b/checkpoint-152/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..fdde94c29816839ec3c29d6c6461206a49911f3c --- /dev/null +++ b/checkpoint-152/tokenizer_config.json @@ -0,0 +1,2064 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": 
"<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": 
"<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": 
"<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": 
"<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": 
"<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + 
"extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|end_of_text|>", + "tokenizer_class": "PreTrainedTokenizer" +} diff --git a/checkpoint-152/trainer_state.json b/checkpoint-152/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..5021b354c42aae3511fd76c6ddacbdb31b31179a --- /dev/null +++ b/checkpoint-152/trainer_state.json @@ -0,0 +1,1097 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 500, + "global_step": 152, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.013157894736842105, + "grad_norm": 34.99433898925781, + "learning_rate": 5.0000000000000004e-08, + "loss": 2.595, + "step": 1 + }, + { + "epoch": 0.02631578947368421, + "grad_norm": 35.6848258972168, + "learning_rate": 1.0000000000000001e-07, + "loss": 2.6447, + "step": 2 + }, + { + "epoch": 0.039473684210526314, + "grad_norm": 35.07997512817383, + "learning_rate": 1.5000000000000002e-07, + "loss": 2.5819, + "step": 3 + }, + { + "epoch": 0.05263157894736842, + "grad_norm": 34.3863525390625, + "learning_rate": 2.0000000000000002e-07, + "loss": 2.5739, + "step": 4 + }, + { + "epoch": 0.06578947368421052, + "grad_norm": 35.443077087402344, + "learning_rate": 2.5000000000000004e-07, + "loss": 2.6071, + "step": 5 + }, + { + "epoch": 0.07894736842105263, + "grad_norm": 34.70173263549805, + "learning_rate": 3.0000000000000004e-07, + "loss": 2.5487, + "step": 6 + }, + { + "epoch": 0.09210526315789473, + "grad_norm": 34.421295166015625, + "learning_rate": 3.5000000000000004e-07, + "loss": 2.5494, + "step": 7 + }, + { + "epoch": 0.10526315789473684, + "grad_norm": 35.152748107910156, + "learning_rate": 4.0000000000000003e-07, + "loss": 2.5936, + "step": 8 + }, + { + "epoch": 0.11842105263157894, + "grad_norm": 34.947021484375, + "learning_rate": 
4.5000000000000003e-07, + "loss": 2.5574, + "step": 9 + }, + { + "epoch": 0.13157894736842105, + "grad_norm": 34.67315673828125, + "learning_rate": 5.000000000000001e-07, + "loss": 2.4894, + "step": 10 + }, + { + "epoch": 0.14473684210526316, + "grad_norm": 34.679954528808594, + "learning_rate": 5.5e-07, + "loss": 2.4985, + "step": 11 + }, + { + "epoch": 0.15789473684210525, + "grad_norm": 33.57002258300781, + "learning_rate": 6.000000000000001e-07, + "loss": 2.4339, + "step": 12 + }, + { + "epoch": 0.17105263157894737, + "grad_norm": 33.517276763916016, + "learning_rate": 6.5e-07, + "loss": 2.4055, + "step": 13 + }, + { + "epoch": 0.18421052631578946, + "grad_norm": 33.5312385559082, + "learning_rate": 7.000000000000001e-07, + "loss": 2.3806, + "step": 14 + }, + { + "epoch": 0.19736842105263158, + "grad_norm": 32.01276779174805, + "learning_rate": 7.5e-07, + "loss": 2.2505, + "step": 15 + }, + { + "epoch": 0.21052631578947367, + "grad_norm": 31.827980041503906, + "learning_rate": 8.000000000000001e-07, + "loss": 2.1359, + "step": 16 + }, + { + "epoch": 0.2236842105263158, + "grad_norm": 31.437101364135742, + "learning_rate": 8.500000000000001e-07, + "loss": 2.1117, + "step": 17 + }, + { + "epoch": 0.23684210526315788, + "grad_norm": 30.315187454223633, + "learning_rate": 9.000000000000001e-07, + "loss": 1.9795, + "step": 18 + }, + { + "epoch": 0.25, + "grad_norm": 29.622655868530273, + "learning_rate": 9.500000000000001e-07, + "loss": 1.8472, + "step": 19 + }, + { + "epoch": 0.2631578947368421, + "grad_norm": 28.628408432006836, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.7283, + "step": 20 + }, + { + "epoch": 0.27631578947368424, + "grad_norm": 27.83180046081543, + "learning_rate": 1.0500000000000001e-06, + "loss": 1.5942, + "step": 21 + }, + { + "epoch": 0.2894736842105263, + "grad_norm": 26.911596298217773, + "learning_rate": 1.1e-06, + "loss": 1.4467, + "step": 22 + }, + { + "epoch": 0.3026315789473684, + "grad_norm": 25.88102149963379, + 
"learning_rate": 1.1500000000000002e-06, + "loss": 1.3007, + "step": 23 + }, + { + "epoch": 0.3157894736842105, + "grad_norm": 25.146381378173828, + "learning_rate": 1.2000000000000002e-06, + "loss": 1.1319, + "step": 24 + }, + { + "epoch": 0.32894736842105265, + "grad_norm": 24.800382614135742, + "learning_rate": 1.25e-06, + "loss": 0.9359, + "step": 25 + }, + { + "epoch": 0.34210526315789475, + "grad_norm": 24.648332595825195, + "learning_rate": 1.3e-06, + "loss": 0.7054, + "step": 26 + }, + { + "epoch": 0.35526315789473684, + "grad_norm": 22.947620391845703, + "learning_rate": 1.3500000000000002e-06, + "loss": 0.5209, + "step": 27 + }, + { + "epoch": 0.3684210526315789, + "grad_norm": 17.80010414123535, + "learning_rate": 1.4000000000000001e-06, + "loss": 0.3546, + "step": 28 + }, + { + "epoch": 0.3815789473684211, + "grad_norm": 11.841789245605469, + "learning_rate": 1.45e-06, + "loss": 0.26, + "step": 29 + }, + { + "epoch": 0.39473684210526316, + "grad_norm": 7.275839805603027, + "learning_rate": 1.5e-06, + "loss": 0.1808, + "step": 30 + }, + { + "epoch": 0.40789473684210525, + "grad_norm": 4.6324543952941895, + "learning_rate": 1.5500000000000002e-06, + "loss": 0.1464, + "step": 31 + }, + { + "epoch": 0.42105263157894735, + "grad_norm": 3.1281485557556152, + "learning_rate": 1.6000000000000001e-06, + "loss": 0.1079, + "step": 32 + }, + { + "epoch": 0.4342105263157895, + "grad_norm": 2.062562942504883, + "learning_rate": 1.6500000000000003e-06, + "loss": 0.0966, + "step": 33 + }, + { + "epoch": 0.4473684210526316, + "grad_norm": 2.1343328952789307, + "learning_rate": 1.7000000000000002e-06, + "loss": 0.088, + "step": 34 + }, + { + "epoch": 0.4605263157894737, + "grad_norm": 1.6768524646759033, + "learning_rate": 1.75e-06, + "loss": 0.0783, + "step": 35 + }, + { + "epoch": 0.47368421052631576, + "grad_norm": 1.0879229307174683, + "learning_rate": 1.8000000000000001e-06, + "loss": 0.0623, + "step": 36 + }, + { + "epoch": 0.4868421052631579, + "grad_norm": 
0.83177649974823, + "learning_rate": 1.85e-06, + "loss": 0.0655, + "step": 37 + }, + { + "epoch": 0.5, + "grad_norm": 0.5678385496139526, + "learning_rate": 1.9000000000000002e-06, + "loss": 0.0565, + "step": 38 + }, + { + "epoch": 0.5131578947368421, + "grad_norm": 0.6994458436965942, + "learning_rate": 1.9500000000000004e-06, + "loss": 0.0491, + "step": 39 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 0.711387038230896, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.0507, + "step": 40 + }, + { + "epoch": 0.5394736842105263, + "grad_norm": 0.7169735431671143, + "learning_rate": 2.05e-06, + "loss": 0.0478, + "step": 41 + }, + { + "epoch": 0.5526315789473685, + "grad_norm": 0.603631317615509, + "learning_rate": 2.1000000000000002e-06, + "loss": 0.0507, + "step": 42 + }, + { + "epoch": 0.5657894736842105, + "grad_norm": 0.617487907409668, + "learning_rate": 2.15e-06, + "loss": 0.043, + "step": 43 + }, + { + "epoch": 0.5789473684210527, + "grad_norm": 0.4638065993785858, + "learning_rate": 2.2e-06, + "loss": 0.0472, + "step": 44 + }, + { + "epoch": 0.5921052631578947, + "grad_norm": 0.5996385216712952, + "learning_rate": 2.25e-06, + "loss": 0.0429, + "step": 45 + }, + { + "epoch": 0.6052631578947368, + "grad_norm": 0.39118286967277527, + "learning_rate": 2.3000000000000004e-06, + "loss": 0.0421, + "step": 46 + }, + { + "epoch": 0.618421052631579, + "grad_norm": 0.3118075728416443, + "learning_rate": 2.35e-06, + "loss": 0.0383, + "step": 47 + }, + { + "epoch": 0.631578947368421, + "grad_norm": 0.31731992959976196, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.041, + "step": 48 + }, + { + "epoch": 0.6447368421052632, + "grad_norm": 0.5413194298744202, + "learning_rate": 2.4500000000000003e-06, + "loss": 0.0397, + "step": 49 + }, + { + "epoch": 0.6578947368421053, + "grad_norm": 0.32958006858825684, + "learning_rate": 2.5e-06, + "loss": 0.0355, + "step": 50 + }, + { + "epoch": 0.6710526315789473, + "grad_norm": 0.596309244632721, + 
"learning_rate": 2.55e-06, + "loss": 0.0413, + "step": 51 + }, + { + "epoch": 0.6842105263157895, + "grad_norm": 0.4557362496852875, + "learning_rate": 2.6e-06, + "loss": 0.0461, + "step": 52 + }, + { + "epoch": 0.6973684210526315, + "grad_norm": 0.3345410227775574, + "learning_rate": 2.6500000000000005e-06, + "loss": 0.0385, + "step": 53 + }, + { + "epoch": 0.7105263157894737, + "grad_norm": 0.3047848343849182, + "learning_rate": 2.7000000000000004e-06, + "loss": 0.0383, + "step": 54 + }, + { + "epoch": 0.7236842105263158, + "grad_norm": 0.43763449788093567, + "learning_rate": 2.7500000000000004e-06, + "loss": 0.038, + "step": 55 + }, + { + "epoch": 0.7368421052631579, + "grad_norm": 0.26870036125183105, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.0374, + "step": 56 + }, + { + "epoch": 0.75, + "grad_norm": 0.38762542605400085, + "learning_rate": 2.85e-06, + "loss": 0.0349, + "step": 57 + }, + { + "epoch": 0.7631578947368421, + "grad_norm": 0.27517396211624146, + "learning_rate": 2.9e-06, + "loss": 0.0398, + "step": 58 + }, + { + "epoch": 0.7763157894736842, + "grad_norm": 0.30815261602401733, + "learning_rate": 2.95e-06, + "loss": 0.0364, + "step": 59 + }, + { + "epoch": 0.7894736842105263, + "grad_norm": 0.30011361837387085, + "learning_rate": 3e-06, + "loss": 0.0307, + "step": 60 + }, + { + "epoch": 0.8026315789473685, + "grad_norm": 0.3269154727458954, + "learning_rate": 3.05e-06, + "loss": 0.0344, + "step": 61 + }, + { + "epoch": 0.8157894736842105, + "grad_norm": 0.3750869333744049, + "learning_rate": 3.1000000000000004e-06, + "loss": 0.0339, + "step": 62 + }, + { + "epoch": 0.8289473684210527, + "grad_norm": 0.29285815358161926, + "learning_rate": 3.1500000000000003e-06, + "loss": 0.034, + "step": 63 + }, + { + "epoch": 0.8421052631578947, + "grad_norm": 0.4157550632953644, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.0348, + "step": 64 + }, + { + "epoch": 0.8552631578947368, + "grad_norm": 0.2852867543697357, + "learning_rate": 
3.2500000000000002e-06, + "loss": 0.0319, + "step": 65 + }, + { + "epoch": 0.868421052631579, + "grad_norm": 0.4384031593799591, + "learning_rate": 3.3000000000000006e-06, + "loss": 0.0319, + "step": 66 + }, + { + "epoch": 0.881578947368421, + "grad_norm": 0.4003254771232605, + "learning_rate": 3.3500000000000005e-06, + "loss": 0.0347, + "step": 67 + }, + { + "epoch": 0.8947368421052632, + "grad_norm": 0.49913832545280457, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.0347, + "step": 68 + }, + { + "epoch": 0.9078947368421053, + "grad_norm": 0.22642269730567932, + "learning_rate": 3.45e-06, + "loss": 0.0306, + "step": 69 + }, + { + "epoch": 0.9210526315789473, + "grad_norm": 0.34004101157188416, + "learning_rate": 3.5e-06, + "loss": 0.0337, + "step": 70 + }, + { + "epoch": 0.9342105263157895, + "grad_norm": 0.21503636240959167, + "learning_rate": 3.5500000000000003e-06, + "loss": 0.0311, + "step": 71 + }, + { + "epoch": 0.9473684210526315, + "grad_norm": 0.33802086114883423, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.0293, + "step": 72 + }, + { + "epoch": 0.9605263157894737, + "grad_norm": 0.2488064169883728, + "learning_rate": 3.65e-06, + "loss": 0.0318, + "step": 73 + }, + { + "epoch": 0.9736842105263158, + "grad_norm": 0.21124528348445892, + "learning_rate": 3.7e-06, + "loss": 0.0293, + "step": 74 + }, + { + "epoch": 0.9868421052631579, + "grad_norm": 0.3108712136745453, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.0288, + "step": 75 + }, + { + "epoch": 1.0, + "grad_norm": 0.33483418822288513, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.031, + "step": 76 + }, + { + "epoch": 1.013157894736842, + "grad_norm": 0.3099130690097809, + "learning_rate": 3.85e-06, + "loss": 0.0286, + "step": 77 + }, + { + "epoch": 1.0263157894736843, + "grad_norm": 0.22946476936340332, + "learning_rate": 3.900000000000001e-06, + "loss": 0.0307, + "step": 78 + }, + { + "epoch": 1.0394736842105263, + "grad_norm": 0.36924120783805847, + 
"learning_rate": 3.95e-06, + "loss": 0.0274, + "step": 79 + }, + { + "epoch": 1.0526315789473684, + "grad_norm": 0.30895617604255676, + "learning_rate": 4.000000000000001e-06, + "loss": 0.0274, + "step": 80 + }, + { + "epoch": 1.0657894736842106, + "grad_norm": 0.42033568024635315, + "learning_rate": 4.05e-06, + "loss": 0.0298, + "step": 81 + }, + { + "epoch": 1.0789473684210527, + "grad_norm": 0.35573887825012207, + "learning_rate": 4.1e-06, + "loss": 0.0286, + "step": 82 + }, + { + "epoch": 1.0921052631578947, + "grad_norm": 0.24631913006305695, + "learning_rate": 4.15e-06, + "loss": 0.0294, + "step": 83 + }, + { + "epoch": 1.1052631578947367, + "grad_norm": 0.2908592224121094, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.0243, + "step": 84 + }, + { + "epoch": 1.118421052631579, + "grad_norm": 0.3293064832687378, + "learning_rate": 4.25e-06, + "loss": 0.0253, + "step": 85 + }, + { + "epoch": 1.131578947368421, + "grad_norm": 0.3789626359939575, + "learning_rate": 4.3e-06, + "loss": 0.0253, + "step": 86 + }, + { + "epoch": 1.1447368421052633, + "grad_norm": 0.3900983929634094, + "learning_rate": 4.350000000000001e-06, + "loss": 0.0248, + "step": 87 + }, + { + "epoch": 1.1578947368421053, + "grad_norm": 0.28972727060317993, + "learning_rate": 4.4e-06, + "loss": 0.0256, + "step": 88 + }, + { + "epoch": 1.1710526315789473, + "grad_norm": 0.4615432620048523, + "learning_rate": 4.450000000000001e-06, + "loss": 0.0259, + "step": 89 + }, + { + "epoch": 1.1842105263157894, + "grad_norm": 0.3959222137928009, + "learning_rate": 4.5e-06, + "loss": 0.0277, + "step": 90 + }, + { + "epoch": 1.1973684210526316, + "grad_norm": 0.4927828907966614, + "learning_rate": 4.5500000000000005e-06, + "loss": 0.0251, + "step": 91 + }, + { + "epoch": 1.2105263157894737, + "grad_norm": 0.23854510486125946, + "learning_rate": 4.600000000000001e-06, + "loss": 0.0214, + "step": 92 + }, + { + "epoch": 1.2236842105263157, + "grad_norm": 0.2470882534980774, + "learning_rate": 4.65e-06, + 
"loss": 0.0255, + "step": 93 + }, + { + "epoch": 1.236842105263158, + "grad_norm": 0.22575952112674713, + "learning_rate": 4.7e-06, + "loss": 0.0208, + "step": 94 + }, + { + "epoch": 1.25, + "grad_norm": 0.437495619058609, + "learning_rate": 4.75e-06, + "loss": 0.0234, + "step": 95 + }, + { + "epoch": 1.263157894736842, + "grad_norm": 0.2712303102016449, + "learning_rate": 4.800000000000001e-06, + "loss": 0.0236, + "step": 96 + }, + { + "epoch": 1.2763157894736843, + "grad_norm": 0.2843461334705353, + "learning_rate": 4.85e-06, + "loss": 0.0195, + "step": 97 + }, + { + "epoch": 1.2894736842105263, + "grad_norm": 0.21141311526298523, + "learning_rate": 4.9000000000000005e-06, + "loss": 0.0223, + "step": 98 + }, + { + "epoch": 1.3026315789473684, + "grad_norm": 0.25484079122543335, + "learning_rate": 4.95e-06, + "loss": 0.0211, + "step": 99 + }, + { + "epoch": 1.3157894736842106, + "grad_norm": 0.358674556016922, + "learning_rate": 5e-06, + "loss": 0.027, + "step": 100 + }, + { + "epoch": 1.3289473684210527, + "grad_norm": 0.20442990958690643, + "learning_rate": 4.999902656502973e-06, + "loss": 0.0234, + "step": 101 + }, + { + "epoch": 1.3421052631578947, + "grad_norm": 0.2281407117843628, + "learning_rate": 4.9996106335924965e-06, + "loss": 0.0243, + "step": 102 + }, + { + "epoch": 1.3552631578947367, + "grad_norm": 0.23803724348545074, + "learning_rate": 4.999123954009797e-06, + "loss": 0.0189, + "step": 103 + }, + { + "epoch": 1.368421052631579, + "grad_norm": 0.21493025124073029, + "learning_rate": 4.998442655654946e-06, + "loss": 0.0207, + "step": 104 + }, + { + "epoch": 1.381578947368421, + "grad_norm": 0.2565159797668457, + "learning_rate": 4.997566791583916e-06, + "loss": 0.0178, + "step": 105 + }, + { + "epoch": 1.3947368421052633, + "grad_norm": 0.3488551378250122, + "learning_rate": 4.996496430004446e-06, + "loss": 0.0226, + "step": 106 + }, + { + "epoch": 1.4078947368421053, + "grad_norm": 0.27695611119270325, + "learning_rate": 4.995231654270726e-06, + 
"loss": 0.0189, + "step": 107 + }, + { + "epoch": 1.4210526315789473, + "grad_norm": 0.23477056622505188, + "learning_rate": 4.993772562876909e-06, + "loss": 0.0182, + "step": 108 + }, + { + "epoch": 1.4342105263157894, + "grad_norm": 0.22611404955387115, + "learning_rate": 4.992119269449445e-06, + "loss": 0.0168, + "step": 109 + }, + { + "epoch": 1.4473684210526316, + "grad_norm": 0.25616368651390076, + "learning_rate": 4.990271902738223e-06, + "loss": 0.022, + "step": 110 + }, + { + "epoch": 1.4605263157894737, + "grad_norm": 0.23842717707157135, + "learning_rate": 4.988230606606552e-06, + "loss": 0.0163, + "step": 111 + }, + { + "epoch": 1.4736842105263157, + "grad_norm": 0.24285905063152313, + "learning_rate": 4.985995540019956e-06, + "loss": 0.0202, + "step": 112 + }, + { + "epoch": 1.486842105263158, + "grad_norm": 0.24602730572223663, + "learning_rate": 4.983566877033791e-06, + "loss": 0.0173, + "step": 113 + }, + { + "epoch": 1.5, + "grad_norm": 0.26218464970588684, + "learning_rate": 4.980944806779698e-06, + "loss": 0.0206, + "step": 114 + }, + { + "epoch": 1.513157894736842, + "grad_norm": 0.2999787926673889, + "learning_rate": 4.9781295334508664e-06, + "loss": 0.0178, + "step": 115 + }, + { + "epoch": 1.526315789473684, + "grad_norm": 0.20500704646110535, + "learning_rate": 4.975121276286136e-06, + "loss": 0.0181, + "step": 116 + }, + { + "epoch": 1.5394736842105263, + "grad_norm": 0.25106561183929443, + "learning_rate": 4.9719202695529265e-06, + "loss": 0.0128, + "step": 117 + }, + { + "epoch": 1.5526315789473686, + "grad_norm": 0.2686936855316162, + "learning_rate": 4.968526762528988e-06, + "loss": 0.0146, + "step": 118 + }, + { + "epoch": 1.5657894736842106, + "grad_norm": 0.2770400047302246, + "learning_rate": 4.964941019482995e-06, + "loss": 0.0167, + "step": 119 + }, + { + "epoch": 1.5789473684210527, + "grad_norm": 0.27510589361190796, + "learning_rate": 4.961163319653959e-06, + "loss": 0.0162, + "step": 120 + }, + { + "epoch": 1.5921052631578947, 
+ "grad_norm": 0.3720133602619171, + "learning_rate": 4.9571939572294914e-06, + "loss": 0.0163, + "step": 121 + }, + { + "epoch": 1.6052631578947367, + "grad_norm": 0.2288741022348404, + "learning_rate": 4.953033241322887e-06, + "loss": 0.0133, + "step": 122 + }, + { + "epoch": 1.618421052631579, + "grad_norm": 0.31084850430488586, + "learning_rate": 4.948681495949055e-06, + "loss": 0.0124, + "step": 123 + }, + { + "epoch": 1.631578947368421, + "grad_norm": 0.19490985572338104, + "learning_rate": 4.944139059999286e-06, + "loss": 0.0114, + "step": 124 + }, + { + "epoch": 1.6447368421052633, + "grad_norm": 0.3074445426464081, + "learning_rate": 4.939406287214861e-06, + "loss": 0.0153, + "step": 125 + }, + { + "epoch": 1.6578947368421053, + "grad_norm": 0.29279908537864685, + "learning_rate": 4.9344835461595016e-06, + "loss": 0.0117, + "step": 126 + }, + { + "epoch": 1.6710526315789473, + "grad_norm": 0.3299407362937927, + "learning_rate": 4.929371220190671e-06, + "loss": 0.0128, + "step": 127 + }, + { + "epoch": 1.6842105263157894, + "grad_norm": 0.24818794429302216, + "learning_rate": 4.9240697074297205e-06, + "loss": 0.0146, + "step": 128 + }, + { + "epoch": 1.6973684210526314, + "grad_norm": 0.35983219742774963, + "learning_rate": 4.918579420730884e-06, + "loss": 0.0138, + "step": 129 + }, + { + "epoch": 1.7105263157894737, + "grad_norm": 0.2583932876586914, + "learning_rate": 4.912900787649124e-06, + "loss": 0.0136, + "step": 130 + }, + { + "epoch": 1.723684210526316, + "grad_norm": 0.20754319429397583, + "learning_rate": 4.907034250406846e-06, + "loss": 0.0116, + "step": 131 + }, + { + "epoch": 1.736842105263158, + "grad_norm": 0.30609601736068726, + "learning_rate": 4.900980265859449e-06, + "loss": 0.0111, + "step": 132 + }, + { + "epoch": 1.75, + "grad_norm": 0.3754304349422455, + "learning_rate": 4.894739305459754e-06, + "loss": 0.0126, + "step": 133 + }, + { + "epoch": 1.763157894736842, + "grad_norm": 0.2517055571079254, + "learning_rate": 
4.88831185522129e-06, + "loss": 0.0118, + "step": 134 + }, + { + "epoch": 1.776315789473684, + "grad_norm": 0.198478102684021, + "learning_rate": 4.881698415680442e-06, + "loss": 0.0087, + "step": 135 + }, + { + "epoch": 1.7894736842105263, + "grad_norm": 0.2307695895433426, + "learning_rate": 4.874899501857477e-06, + "loss": 0.0094, + "step": 136 + }, + { + "epoch": 1.8026315789473686, + "grad_norm": 0.17823486030101776, + "learning_rate": 4.867915643216434e-06, + "loss": 0.0098, + "step": 137 + }, + { + "epoch": 1.8157894736842106, + "grad_norm": 0.2157433032989502, + "learning_rate": 4.860747383623889e-06, + "loss": 0.0114, + "step": 138 + }, + { + "epoch": 1.8289473684210527, + "grad_norm": 0.21051311492919922, + "learning_rate": 4.85339528130661e-06, + "loss": 0.011, + "step": 139 + }, + { + "epoch": 1.8421052631578947, + "grad_norm": 0.32886555790901184, + "learning_rate": 4.845859908808074e-06, + "loss": 0.011, + "step": 140 + }, + { + "epoch": 1.8552631578947367, + "grad_norm": 0.22413378953933716, + "learning_rate": 4.838141852943891e-06, + "loss": 0.0087, + "step": 141 + }, + { + "epoch": 1.868421052631579, + "grad_norm": 0.2896019518375397, + "learning_rate": 4.830241714756099e-06, + "loss": 0.011, + "step": 142 + }, + { + "epoch": 1.881578947368421, + "grad_norm": 0.26163023710250854, + "learning_rate": 4.822160109466361e-06, + "loss": 0.0084, + "step": 143 + }, + { + "epoch": 1.8947368421052633, + "grad_norm": 0.23998413980007172, + "learning_rate": 4.813897666428054e-06, + "loss": 0.0094, + "step": 144 + }, + { + "epoch": 1.9078947368421053, + "grad_norm": 0.2334728091955185, + "learning_rate": 4.805455029077255e-06, + "loss": 0.007, + "step": 145 + }, + { + "epoch": 1.9210526315789473, + "grad_norm": 0.17431940138339996, + "learning_rate": 4.79683285488264e-06, + "loss": 0.0047, + "step": 146 + }, + { + "epoch": 1.9342105263157894, + "grad_norm": 0.19151932001113892, + "learning_rate": 4.788031815294282e-06, + "loss": 0.0056, + "step": 147 + }, + { + 
"epoch": 1.9473684210526314, + "grad_norm": 0.2352588027715683, + "learning_rate": 4.779052595691355e-06, + "loss": 0.0107, + "step": 148 + }, + { + "epoch": 1.9605263157894737, + "grad_norm": 0.2848915159702301, + "learning_rate": 4.76989589532877e-06, + "loss": 0.0074, + "step": 149 + }, + { + "epoch": 1.973684210526316, + "grad_norm": 0.218011736869812, + "learning_rate": 4.7605624272827125e-06, + "loss": 0.0075, + "step": 150 + }, + { + "epoch": 1.986842105263158, + "grad_norm": 0.3043143153190613, + "learning_rate": 4.75105291839512e-06, + "loss": 0.0073, + "step": 151 + }, + { + "epoch": 2.0, + "grad_norm": 0.16677772998809814, + "learning_rate": 4.741368109217072e-06, + "loss": 0.0065, + "step": 152 + } + ], + "logging_steps": 1, + "max_steps": 456, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 76, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 9.77843856062441e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-152/training_args.bin b/checkpoint-152/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..5a1edbdcc63a93daa09112168cf20c0f8fcb7512 --- /dev/null +++ b/checkpoint-152/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:041cfaa5bf8383821dea4fa5a9d2eab2caad4644c4cd651398c8b0ab1541b270 +size 7992 diff --git a/checkpoint-152/zero_to_fp32.py b/checkpoint-152/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/checkpoint-152/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, 
"zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = 
zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, 
zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + 
full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + 
wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # 
recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. 
+ + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. 
(one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/checkpoint-228/README.md b/checkpoint-228/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fdf619c317c2fe82074662582dbd68166b6f9d50 --- /dev/null +++ b/checkpoint-228/README.md @@ -0,0 +1,202 @@ +--- +base_model: nvidia/Llama-3.1-Nemotron-70B-Instruct-HF +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.0 \ No newline at end of file diff --git a/checkpoint-228/adapter_config.json b/checkpoint-228/adapter_config.json new file mode 100644 index 
0000000000000000000000000000000000000000..a192388a7b55129be9ad9168abc396b47bbda6f7 --- /dev/null +++ b/checkpoint-228/adapter_config.json @@ -0,0 +1,42 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 512, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [ + "embed_tokens", + "lm_head" + ], + "peft_type": "LORA", + "r": 256, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "q_proj", + "o_proj", + "down_proj", + "gate_proj", + "k_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-228/adapter_model.safetensors b/checkpoint-228/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..552238fb19f5bee846efa646160c8a6de77c7fa5 --- /dev/null +++ b/checkpoint-228/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13ce11f4b2ce49bf748bc7fdaad8acb0700e9819541af009bb2f641b962cd27d +size 10829849744 diff --git a/checkpoint-228/global_step228/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-228/global_step228/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ade9d2a22111f8db8781f64e595d28131c276828 --- /dev/null +++ b/checkpoint-228/global_step228/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:d37d6738b6c219466caf1821d05aa5edf4b0fd99e128743bb42259934efbcb77 +size 21659418140 diff --git a/checkpoint-228/global_step228/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-228/global_step228/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4050e7420030871334cc998bf4caf4cb9d8e21d9 --- /dev/null +++ b/checkpoint-228/global_step228/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a88511d54c9d9a9c220a6746b09ac106594ca8a7dd0b6a68c600206a4317f0bc +size 21659457372 diff --git a/checkpoint-228/global_step228/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-228/global_step228/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9e732369059ca208581df3274ca7c58a47c6465c --- /dev/null +++ b/checkpoint-228/global_step228/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:abc6d214d1e3892f0f1a4ba79c1f88979e993f1311f740deaa3fea6045a076a1 +size 21659417820 diff --git a/checkpoint-228/global_step228/mp_rank_00_model_states.pt b/checkpoint-228/global_step228/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..622707c19a9df03a293d46b1fde2a8c941a5ecff --- /dev/null +++ b/checkpoint-228/global_step228/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7569f86e5b8ad4531c06e5247ce8f5365b9ff384a45c41572a37f7795e769612 +size 11918643933 diff --git a/checkpoint-228/latest b/checkpoint-228/latest new file mode 100644 index 0000000000000000000000000000000000000000..74f667dd5aec7b1dcf458da255b4d04f2e864037 --- /dev/null +++ b/checkpoint-228/latest @@ -0,0 +1 @@ +global_step228 \ No newline at end of file diff --git a/checkpoint-228/rng_state_0.pth b/checkpoint-228/rng_state_0.pth new file mode 100644 index 
0000000000000000000000000000000000000000..7e4e448cfbd2b0add7bf99082d4db1840a91b8ff --- /dev/null +++ b/checkpoint-228/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f597b8a8ff3fa0c1ca0852531a2c83f947d8ea6229f12dcf84cd40e9d2bdd735 +size 14768 diff --git a/checkpoint-228/rng_state_1.pth b/checkpoint-228/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..23d948eb6390eec22634357b14847f9feadb29dc --- /dev/null +++ b/checkpoint-228/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2630593d343fda4e989879bfd0f94abc55cf145925788b6d823f88bb73bfdfe +size 14768 diff --git a/checkpoint-228/rng_state_2.pth b/checkpoint-228/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..8ee328e2bdcbd65a0af107cef8782b2d2759fb7b --- /dev/null +++ b/checkpoint-228/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7138f65fc4992f28f481beb719c5f1191669f411d0001b0b14e2535745da64d +size 14768 diff --git a/checkpoint-228/scheduler.pt b/checkpoint-228/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..281433de4d85a441705e6ae0cdc6d3d9fb9482f3 --- /dev/null +++ b/checkpoint-228/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d7d1e7bc6044d2a4e5e390e5599228af42cccdc946da11715716db6eef73066 +size 1064 diff --git a/checkpoint-228/special_tokens_map.json b/checkpoint-228/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..278b7f0f84be865c4687700ee7b3c63d89a51e18 --- /dev/null +++ b/checkpoint-228/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + 
"pad_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-228/tokenizer.json b/checkpoint-228/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-228/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-228/tokenizer_config.json b/checkpoint-228/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..fdde94c29816839ec3c29d6c6461206a49911f3c --- /dev/null +++ b/checkpoint-228/tokenizer_config.json @@ -0,0 +1,2064 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": 
"<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": 
"<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": 
"<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": 
"<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": 
"<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + 
"extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|end_of_text|>", + "tokenizer_class": "PreTrainedTokenizer" +} diff --git a/checkpoint-228/trainer_state.json b/checkpoint-228/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..8d6baf0d1c0b59d58524e279fb3843787c04fca6 --- /dev/null +++ b/checkpoint-228/trainer_state.json @@ -0,0 +1,1629 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 228, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.013157894736842105, + "grad_norm": 34.99433898925781, + "learning_rate": 5.0000000000000004e-08, + "loss": 2.595, + "step": 1 + }, + { + "epoch": 0.02631578947368421, + "grad_norm": 35.6848258972168, + "learning_rate": 1.0000000000000001e-07, + "loss": 2.6447, + "step": 2 + }, + { + "epoch": 0.039473684210526314, + "grad_norm": 35.07997512817383, + "learning_rate": 1.5000000000000002e-07, + "loss": 2.5819, + "step": 3 + }, + { + "epoch": 0.05263157894736842, + "grad_norm": 34.3863525390625, + "learning_rate": 2.0000000000000002e-07, + "loss": 2.5739, + "step": 4 + }, + { + "epoch": 0.06578947368421052, + "grad_norm": 35.443077087402344, + "learning_rate": 2.5000000000000004e-07, + "loss": 2.6071, + "step": 5 + }, + { + "epoch": 0.07894736842105263, + "grad_norm": 34.70173263549805, + "learning_rate": 3.0000000000000004e-07, + "loss": 2.5487, + "step": 6 + }, + { + "epoch": 0.09210526315789473, + "grad_norm": 34.421295166015625, + "learning_rate": 3.5000000000000004e-07, + "loss": 2.5494, + "step": 7 + }, + { + "epoch": 0.10526315789473684, + "grad_norm": 35.152748107910156, + "learning_rate": 4.0000000000000003e-07, + "loss": 2.5936, + "step": 8 + }, + { + "epoch": 0.11842105263157894, + "grad_norm": 34.947021484375, + "learning_rate": 
4.5000000000000003e-07, + "loss": 2.5574, + "step": 9 + }, + { + "epoch": 0.13157894736842105, + "grad_norm": 34.67315673828125, + "learning_rate": 5.000000000000001e-07, + "loss": 2.4894, + "step": 10 + }, + { + "epoch": 0.14473684210526316, + "grad_norm": 34.679954528808594, + "learning_rate": 5.5e-07, + "loss": 2.4985, + "step": 11 + }, + { + "epoch": 0.15789473684210525, + "grad_norm": 33.57002258300781, + "learning_rate": 6.000000000000001e-07, + "loss": 2.4339, + "step": 12 + }, + { + "epoch": 0.17105263157894737, + "grad_norm": 33.517276763916016, + "learning_rate": 6.5e-07, + "loss": 2.4055, + "step": 13 + }, + { + "epoch": 0.18421052631578946, + "grad_norm": 33.5312385559082, + "learning_rate": 7.000000000000001e-07, + "loss": 2.3806, + "step": 14 + }, + { + "epoch": 0.19736842105263158, + "grad_norm": 32.01276779174805, + "learning_rate": 7.5e-07, + "loss": 2.2505, + "step": 15 + }, + { + "epoch": 0.21052631578947367, + "grad_norm": 31.827980041503906, + "learning_rate": 8.000000000000001e-07, + "loss": 2.1359, + "step": 16 + }, + { + "epoch": 0.2236842105263158, + "grad_norm": 31.437101364135742, + "learning_rate": 8.500000000000001e-07, + "loss": 2.1117, + "step": 17 + }, + { + "epoch": 0.23684210526315788, + "grad_norm": 30.315187454223633, + "learning_rate": 9.000000000000001e-07, + "loss": 1.9795, + "step": 18 + }, + { + "epoch": 0.25, + "grad_norm": 29.622655868530273, + "learning_rate": 9.500000000000001e-07, + "loss": 1.8472, + "step": 19 + }, + { + "epoch": 0.2631578947368421, + "grad_norm": 28.628408432006836, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.7283, + "step": 20 + }, + { + "epoch": 0.27631578947368424, + "grad_norm": 27.83180046081543, + "learning_rate": 1.0500000000000001e-06, + "loss": 1.5942, + "step": 21 + }, + { + "epoch": 0.2894736842105263, + "grad_norm": 26.911596298217773, + "learning_rate": 1.1e-06, + "loss": 1.4467, + "step": 22 + }, + { + "epoch": 0.3026315789473684, + "grad_norm": 25.88102149963379, + 
"learning_rate": 1.1500000000000002e-06, + "loss": 1.3007, + "step": 23 + }, + { + "epoch": 0.3157894736842105, + "grad_norm": 25.146381378173828, + "learning_rate": 1.2000000000000002e-06, + "loss": 1.1319, + "step": 24 + }, + { + "epoch": 0.32894736842105265, + "grad_norm": 24.800382614135742, + "learning_rate": 1.25e-06, + "loss": 0.9359, + "step": 25 + }, + { + "epoch": 0.34210526315789475, + "grad_norm": 24.648332595825195, + "learning_rate": 1.3e-06, + "loss": 0.7054, + "step": 26 + }, + { + "epoch": 0.35526315789473684, + "grad_norm": 22.947620391845703, + "learning_rate": 1.3500000000000002e-06, + "loss": 0.5209, + "step": 27 + }, + { + "epoch": 0.3684210526315789, + "grad_norm": 17.80010414123535, + "learning_rate": 1.4000000000000001e-06, + "loss": 0.3546, + "step": 28 + }, + { + "epoch": 0.3815789473684211, + "grad_norm": 11.841789245605469, + "learning_rate": 1.45e-06, + "loss": 0.26, + "step": 29 + }, + { + "epoch": 0.39473684210526316, + "grad_norm": 7.275839805603027, + "learning_rate": 1.5e-06, + "loss": 0.1808, + "step": 30 + }, + { + "epoch": 0.40789473684210525, + "grad_norm": 4.6324543952941895, + "learning_rate": 1.5500000000000002e-06, + "loss": 0.1464, + "step": 31 + }, + { + "epoch": 0.42105263157894735, + "grad_norm": 3.1281485557556152, + "learning_rate": 1.6000000000000001e-06, + "loss": 0.1079, + "step": 32 + }, + { + "epoch": 0.4342105263157895, + "grad_norm": 2.062562942504883, + "learning_rate": 1.6500000000000003e-06, + "loss": 0.0966, + "step": 33 + }, + { + "epoch": 0.4473684210526316, + "grad_norm": 2.1343328952789307, + "learning_rate": 1.7000000000000002e-06, + "loss": 0.088, + "step": 34 + }, + { + "epoch": 0.4605263157894737, + "grad_norm": 1.6768524646759033, + "learning_rate": 1.75e-06, + "loss": 0.0783, + "step": 35 + }, + { + "epoch": 0.47368421052631576, + "grad_norm": 1.0879229307174683, + "learning_rate": 1.8000000000000001e-06, + "loss": 0.0623, + "step": 36 + }, + { + "epoch": 0.4868421052631579, + "grad_norm": 
0.83177649974823, + "learning_rate": 1.85e-06, + "loss": 0.0655, + "step": 37 + }, + { + "epoch": 0.5, + "grad_norm": 0.5678385496139526, + "learning_rate": 1.9000000000000002e-06, + "loss": 0.0565, + "step": 38 + }, + { + "epoch": 0.5131578947368421, + "grad_norm": 0.6994458436965942, + "learning_rate": 1.9500000000000004e-06, + "loss": 0.0491, + "step": 39 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 0.711387038230896, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.0507, + "step": 40 + }, + { + "epoch": 0.5394736842105263, + "grad_norm": 0.7169735431671143, + "learning_rate": 2.05e-06, + "loss": 0.0478, + "step": 41 + }, + { + "epoch": 0.5526315789473685, + "grad_norm": 0.603631317615509, + "learning_rate": 2.1000000000000002e-06, + "loss": 0.0507, + "step": 42 + }, + { + "epoch": 0.5657894736842105, + "grad_norm": 0.617487907409668, + "learning_rate": 2.15e-06, + "loss": 0.043, + "step": 43 + }, + { + "epoch": 0.5789473684210527, + "grad_norm": 0.4638065993785858, + "learning_rate": 2.2e-06, + "loss": 0.0472, + "step": 44 + }, + { + "epoch": 0.5921052631578947, + "grad_norm": 0.5996385216712952, + "learning_rate": 2.25e-06, + "loss": 0.0429, + "step": 45 + }, + { + "epoch": 0.6052631578947368, + "grad_norm": 0.39118286967277527, + "learning_rate": 2.3000000000000004e-06, + "loss": 0.0421, + "step": 46 + }, + { + "epoch": 0.618421052631579, + "grad_norm": 0.3118075728416443, + "learning_rate": 2.35e-06, + "loss": 0.0383, + "step": 47 + }, + { + "epoch": 0.631578947368421, + "grad_norm": 0.31731992959976196, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.041, + "step": 48 + }, + { + "epoch": 0.6447368421052632, + "grad_norm": 0.5413194298744202, + "learning_rate": 2.4500000000000003e-06, + "loss": 0.0397, + "step": 49 + }, + { + "epoch": 0.6578947368421053, + "grad_norm": 0.32958006858825684, + "learning_rate": 2.5e-06, + "loss": 0.0355, + "step": 50 + }, + { + "epoch": 0.6710526315789473, + "grad_norm": 0.596309244632721, + 
"learning_rate": 2.55e-06, + "loss": 0.0413, + "step": 51 + }, + { + "epoch": 0.6842105263157895, + "grad_norm": 0.4557362496852875, + "learning_rate": 2.6e-06, + "loss": 0.0461, + "step": 52 + }, + { + "epoch": 0.6973684210526315, + "grad_norm": 0.3345410227775574, + "learning_rate": 2.6500000000000005e-06, + "loss": 0.0385, + "step": 53 + }, + { + "epoch": 0.7105263157894737, + "grad_norm": 0.3047848343849182, + "learning_rate": 2.7000000000000004e-06, + "loss": 0.0383, + "step": 54 + }, + { + "epoch": 0.7236842105263158, + "grad_norm": 0.43763449788093567, + "learning_rate": 2.7500000000000004e-06, + "loss": 0.038, + "step": 55 + }, + { + "epoch": 0.7368421052631579, + "grad_norm": 0.26870036125183105, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.0374, + "step": 56 + }, + { + "epoch": 0.75, + "grad_norm": 0.38762542605400085, + "learning_rate": 2.85e-06, + "loss": 0.0349, + "step": 57 + }, + { + "epoch": 0.7631578947368421, + "grad_norm": 0.27517396211624146, + "learning_rate": 2.9e-06, + "loss": 0.0398, + "step": 58 + }, + { + "epoch": 0.7763157894736842, + "grad_norm": 0.30815261602401733, + "learning_rate": 2.95e-06, + "loss": 0.0364, + "step": 59 + }, + { + "epoch": 0.7894736842105263, + "grad_norm": 0.30011361837387085, + "learning_rate": 3e-06, + "loss": 0.0307, + "step": 60 + }, + { + "epoch": 0.8026315789473685, + "grad_norm": 0.3269154727458954, + "learning_rate": 3.05e-06, + "loss": 0.0344, + "step": 61 + }, + { + "epoch": 0.8157894736842105, + "grad_norm": 0.3750869333744049, + "learning_rate": 3.1000000000000004e-06, + "loss": 0.0339, + "step": 62 + }, + { + "epoch": 0.8289473684210527, + "grad_norm": 0.29285815358161926, + "learning_rate": 3.1500000000000003e-06, + "loss": 0.034, + "step": 63 + }, + { + "epoch": 0.8421052631578947, + "grad_norm": 0.4157550632953644, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.0348, + "step": 64 + }, + { + "epoch": 0.8552631578947368, + "grad_norm": 0.2852867543697357, + "learning_rate": 
3.2500000000000002e-06, + "loss": 0.0319, + "step": 65 + }, + { + "epoch": 0.868421052631579, + "grad_norm": 0.4384031593799591, + "learning_rate": 3.3000000000000006e-06, + "loss": 0.0319, + "step": 66 + }, + { + "epoch": 0.881578947368421, + "grad_norm": 0.4003254771232605, + "learning_rate": 3.3500000000000005e-06, + "loss": 0.0347, + "step": 67 + }, + { + "epoch": 0.8947368421052632, + "grad_norm": 0.49913832545280457, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.0347, + "step": 68 + }, + { + "epoch": 0.9078947368421053, + "grad_norm": 0.22642269730567932, + "learning_rate": 3.45e-06, + "loss": 0.0306, + "step": 69 + }, + { + "epoch": 0.9210526315789473, + "grad_norm": 0.34004101157188416, + "learning_rate": 3.5e-06, + "loss": 0.0337, + "step": 70 + }, + { + "epoch": 0.9342105263157895, + "grad_norm": 0.21503636240959167, + "learning_rate": 3.5500000000000003e-06, + "loss": 0.0311, + "step": 71 + }, + { + "epoch": 0.9473684210526315, + "grad_norm": 0.33802086114883423, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.0293, + "step": 72 + }, + { + "epoch": 0.9605263157894737, + "grad_norm": 0.2488064169883728, + "learning_rate": 3.65e-06, + "loss": 0.0318, + "step": 73 + }, + { + "epoch": 0.9736842105263158, + "grad_norm": 0.21124528348445892, + "learning_rate": 3.7e-06, + "loss": 0.0293, + "step": 74 + }, + { + "epoch": 0.9868421052631579, + "grad_norm": 0.3108712136745453, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.0288, + "step": 75 + }, + { + "epoch": 1.0, + "grad_norm": 0.33483418822288513, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.031, + "step": 76 + }, + { + "epoch": 1.013157894736842, + "grad_norm": 0.3099130690097809, + "learning_rate": 3.85e-06, + "loss": 0.0286, + "step": 77 + }, + { + "epoch": 1.0263157894736843, + "grad_norm": 0.22946476936340332, + "learning_rate": 3.900000000000001e-06, + "loss": 0.0307, + "step": 78 + }, + { + "epoch": 1.0394736842105263, + "grad_norm": 0.36924120783805847, + 
"learning_rate": 3.95e-06, + "loss": 0.0274, + "step": 79 + }, + { + "epoch": 1.0526315789473684, + "grad_norm": 0.30895617604255676, + "learning_rate": 4.000000000000001e-06, + "loss": 0.0274, + "step": 80 + }, + { + "epoch": 1.0657894736842106, + "grad_norm": 0.42033568024635315, + "learning_rate": 4.05e-06, + "loss": 0.0298, + "step": 81 + }, + { + "epoch": 1.0789473684210527, + "grad_norm": 0.35573887825012207, + "learning_rate": 4.1e-06, + "loss": 0.0286, + "step": 82 + }, + { + "epoch": 1.0921052631578947, + "grad_norm": 0.24631913006305695, + "learning_rate": 4.15e-06, + "loss": 0.0294, + "step": 83 + }, + { + "epoch": 1.1052631578947367, + "grad_norm": 0.2908592224121094, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.0243, + "step": 84 + }, + { + "epoch": 1.118421052631579, + "grad_norm": 0.3293064832687378, + "learning_rate": 4.25e-06, + "loss": 0.0253, + "step": 85 + }, + { + "epoch": 1.131578947368421, + "grad_norm": 0.3789626359939575, + "learning_rate": 4.3e-06, + "loss": 0.0253, + "step": 86 + }, + { + "epoch": 1.1447368421052633, + "grad_norm": 0.3900983929634094, + "learning_rate": 4.350000000000001e-06, + "loss": 0.0248, + "step": 87 + }, + { + "epoch": 1.1578947368421053, + "grad_norm": 0.28972727060317993, + "learning_rate": 4.4e-06, + "loss": 0.0256, + "step": 88 + }, + { + "epoch": 1.1710526315789473, + "grad_norm": 0.4615432620048523, + "learning_rate": 4.450000000000001e-06, + "loss": 0.0259, + "step": 89 + }, + { + "epoch": 1.1842105263157894, + "grad_norm": 0.3959222137928009, + "learning_rate": 4.5e-06, + "loss": 0.0277, + "step": 90 + }, + { + "epoch": 1.1973684210526316, + "grad_norm": 0.4927828907966614, + "learning_rate": 4.5500000000000005e-06, + "loss": 0.0251, + "step": 91 + }, + { + "epoch": 1.2105263157894737, + "grad_norm": 0.23854510486125946, + "learning_rate": 4.600000000000001e-06, + "loss": 0.0214, + "step": 92 + }, + { + "epoch": 1.2236842105263157, + "grad_norm": 0.2470882534980774, + "learning_rate": 4.65e-06, + 
"loss": 0.0255, + "step": 93 + }, + { + "epoch": 1.236842105263158, + "grad_norm": 0.22575952112674713, + "learning_rate": 4.7e-06, + "loss": 0.0208, + "step": 94 + }, + { + "epoch": 1.25, + "grad_norm": 0.437495619058609, + "learning_rate": 4.75e-06, + "loss": 0.0234, + "step": 95 + }, + { + "epoch": 1.263157894736842, + "grad_norm": 0.2712303102016449, + "learning_rate": 4.800000000000001e-06, + "loss": 0.0236, + "step": 96 + }, + { + "epoch": 1.2763157894736843, + "grad_norm": 0.2843461334705353, + "learning_rate": 4.85e-06, + "loss": 0.0195, + "step": 97 + }, + { + "epoch": 1.2894736842105263, + "grad_norm": 0.21141311526298523, + "learning_rate": 4.9000000000000005e-06, + "loss": 0.0223, + "step": 98 + }, + { + "epoch": 1.3026315789473684, + "grad_norm": 0.25484079122543335, + "learning_rate": 4.95e-06, + "loss": 0.0211, + "step": 99 + }, + { + "epoch": 1.3157894736842106, + "grad_norm": 0.358674556016922, + "learning_rate": 5e-06, + "loss": 0.027, + "step": 100 + }, + { + "epoch": 1.3289473684210527, + "grad_norm": 0.20442990958690643, + "learning_rate": 4.999902656502973e-06, + "loss": 0.0234, + "step": 101 + }, + { + "epoch": 1.3421052631578947, + "grad_norm": 0.2281407117843628, + "learning_rate": 4.9996106335924965e-06, + "loss": 0.0243, + "step": 102 + }, + { + "epoch": 1.3552631578947367, + "grad_norm": 0.23803724348545074, + "learning_rate": 4.999123954009797e-06, + "loss": 0.0189, + "step": 103 + }, + { + "epoch": 1.368421052631579, + "grad_norm": 0.21493025124073029, + "learning_rate": 4.998442655654946e-06, + "loss": 0.0207, + "step": 104 + }, + { + "epoch": 1.381578947368421, + "grad_norm": 0.2565159797668457, + "learning_rate": 4.997566791583916e-06, + "loss": 0.0178, + "step": 105 + }, + { + "epoch": 1.3947368421052633, + "grad_norm": 0.3488551378250122, + "learning_rate": 4.996496430004446e-06, + "loss": 0.0226, + "step": 106 + }, + { + "epoch": 1.4078947368421053, + "grad_norm": 0.27695611119270325, + "learning_rate": 4.995231654270726e-06, + 
"loss": 0.0189, + "step": 107 + }, + { + "epoch": 1.4210526315789473, + "grad_norm": 0.23477056622505188, + "learning_rate": 4.993772562876909e-06, + "loss": 0.0182, + "step": 108 + }, + { + "epoch": 1.4342105263157894, + "grad_norm": 0.22611404955387115, + "learning_rate": 4.992119269449445e-06, + "loss": 0.0168, + "step": 109 + }, + { + "epoch": 1.4473684210526316, + "grad_norm": 0.25616368651390076, + "learning_rate": 4.990271902738223e-06, + "loss": 0.022, + "step": 110 + }, + { + "epoch": 1.4605263157894737, + "grad_norm": 0.23842717707157135, + "learning_rate": 4.988230606606552e-06, + "loss": 0.0163, + "step": 111 + }, + { + "epoch": 1.4736842105263157, + "grad_norm": 0.24285905063152313, + "learning_rate": 4.985995540019956e-06, + "loss": 0.0202, + "step": 112 + }, + { + "epoch": 1.486842105263158, + "grad_norm": 0.24602730572223663, + "learning_rate": 4.983566877033791e-06, + "loss": 0.0173, + "step": 113 + }, + { + "epoch": 1.5, + "grad_norm": 0.26218464970588684, + "learning_rate": 4.980944806779698e-06, + "loss": 0.0206, + "step": 114 + }, + { + "epoch": 1.513157894736842, + "grad_norm": 0.2999787926673889, + "learning_rate": 4.9781295334508664e-06, + "loss": 0.0178, + "step": 115 + }, + { + "epoch": 1.526315789473684, + "grad_norm": 0.20500704646110535, + "learning_rate": 4.975121276286136e-06, + "loss": 0.0181, + "step": 116 + }, + { + "epoch": 1.5394736842105263, + "grad_norm": 0.25106561183929443, + "learning_rate": 4.9719202695529265e-06, + "loss": 0.0128, + "step": 117 + }, + { + "epoch": 1.5526315789473686, + "grad_norm": 0.2686936855316162, + "learning_rate": 4.968526762528988e-06, + "loss": 0.0146, + "step": 118 + }, + { + "epoch": 1.5657894736842106, + "grad_norm": 0.2770400047302246, + "learning_rate": 4.964941019482995e-06, + "loss": 0.0167, + "step": 119 + }, + { + "epoch": 1.5789473684210527, + "grad_norm": 0.27510589361190796, + "learning_rate": 4.961163319653959e-06, + "loss": 0.0162, + "step": 120 + }, + { + "epoch": 1.5921052631578947, 
+ "grad_norm": 0.3720133602619171, + "learning_rate": 4.9571939572294914e-06, + "loss": 0.0163, + "step": 121 + }, + { + "epoch": 1.6052631578947367, + "grad_norm": 0.2288741022348404, + "learning_rate": 4.953033241322887e-06, + "loss": 0.0133, + "step": 122 + }, + { + "epoch": 1.618421052631579, + "grad_norm": 0.31084850430488586, + "learning_rate": 4.948681495949055e-06, + "loss": 0.0124, + "step": 123 + }, + { + "epoch": 1.631578947368421, + "grad_norm": 0.19490985572338104, + "learning_rate": 4.944139059999286e-06, + "loss": 0.0114, + "step": 124 + }, + { + "epoch": 1.6447368421052633, + "grad_norm": 0.3074445426464081, + "learning_rate": 4.939406287214861e-06, + "loss": 0.0153, + "step": 125 + }, + { + "epoch": 1.6578947368421053, + "grad_norm": 0.29279908537864685, + "learning_rate": 4.9344835461595016e-06, + "loss": 0.0117, + "step": 126 + }, + { + "epoch": 1.6710526315789473, + "grad_norm": 0.3299407362937927, + "learning_rate": 4.929371220190671e-06, + "loss": 0.0128, + "step": 127 + }, + { + "epoch": 1.6842105263157894, + "grad_norm": 0.24818794429302216, + "learning_rate": 4.9240697074297205e-06, + "loss": 0.0146, + "step": 128 + }, + { + "epoch": 1.6973684210526314, + "grad_norm": 0.35983219742774963, + "learning_rate": 4.918579420730884e-06, + "loss": 0.0138, + "step": 129 + }, + { + "epoch": 1.7105263157894737, + "grad_norm": 0.2583932876586914, + "learning_rate": 4.912900787649124e-06, + "loss": 0.0136, + "step": 130 + }, + { + "epoch": 1.723684210526316, + "grad_norm": 0.20754319429397583, + "learning_rate": 4.907034250406846e-06, + "loss": 0.0116, + "step": 131 + }, + { + "epoch": 1.736842105263158, + "grad_norm": 0.30609601736068726, + "learning_rate": 4.900980265859449e-06, + "loss": 0.0111, + "step": 132 + }, + { + "epoch": 1.75, + "grad_norm": 0.3754304349422455, + "learning_rate": 4.894739305459754e-06, + "loss": 0.0126, + "step": 133 + }, + { + "epoch": 1.763157894736842, + "grad_norm": 0.2517055571079254, + "learning_rate": 
4.88831185522129e-06, + "loss": 0.0118, + "step": 134 + }, + { + "epoch": 1.776315789473684, + "grad_norm": 0.198478102684021, + "learning_rate": 4.881698415680442e-06, + "loss": 0.0087, + "step": 135 + }, + { + "epoch": 1.7894736842105263, + "grad_norm": 0.2307695895433426, + "learning_rate": 4.874899501857477e-06, + "loss": 0.0094, + "step": 136 + }, + { + "epoch": 1.8026315789473686, + "grad_norm": 0.17823486030101776, + "learning_rate": 4.867915643216434e-06, + "loss": 0.0098, + "step": 137 + }, + { + "epoch": 1.8157894736842106, + "grad_norm": 0.2157433032989502, + "learning_rate": 4.860747383623889e-06, + "loss": 0.0114, + "step": 138 + }, + { + "epoch": 1.8289473684210527, + "grad_norm": 0.21051311492919922, + "learning_rate": 4.85339528130661e-06, + "loss": 0.011, + "step": 139 + }, + { + "epoch": 1.8421052631578947, + "grad_norm": 0.32886555790901184, + "learning_rate": 4.845859908808074e-06, + "loss": 0.011, + "step": 140 + }, + { + "epoch": 1.8552631578947367, + "grad_norm": 0.22413378953933716, + "learning_rate": 4.838141852943891e-06, + "loss": 0.0087, + "step": 141 + }, + { + "epoch": 1.868421052631579, + "grad_norm": 0.2896019518375397, + "learning_rate": 4.830241714756099e-06, + "loss": 0.011, + "step": 142 + }, + { + "epoch": 1.881578947368421, + "grad_norm": 0.26163023710250854, + "learning_rate": 4.822160109466361e-06, + "loss": 0.0084, + "step": 143 + }, + { + "epoch": 1.8947368421052633, + "grad_norm": 0.23998413980007172, + "learning_rate": 4.813897666428054e-06, + "loss": 0.0094, + "step": 144 + }, + { + "epoch": 1.9078947368421053, + "grad_norm": 0.2334728091955185, + "learning_rate": 4.805455029077255e-06, + "loss": 0.007, + "step": 145 + }, + { + "epoch": 1.9210526315789473, + "grad_norm": 0.17431940138339996, + "learning_rate": 4.79683285488264e-06, + "loss": 0.0047, + "step": 146 + }, + { + "epoch": 1.9342105263157894, + "grad_norm": 0.19151932001113892, + "learning_rate": 4.788031815294282e-06, + "loss": 0.0056, + "step": 147 + }, + { + 
"epoch": 1.9473684210526314, + "grad_norm": 0.2352588027715683, + "learning_rate": 4.779052595691355e-06, + "loss": 0.0107, + "step": 148 + }, + { + "epoch": 1.9605263157894737, + "grad_norm": 0.2848915159702301, + "learning_rate": 4.76989589532877e-06, + "loss": 0.0074, + "step": 149 + }, + { + "epoch": 1.973684210526316, + "grad_norm": 0.218011736869812, + "learning_rate": 4.7605624272827125e-06, + "loss": 0.0075, + "step": 150 + }, + { + "epoch": 1.986842105263158, + "grad_norm": 0.3043143153190613, + "learning_rate": 4.75105291839512e-06, + "loss": 0.0073, + "step": 151 + }, + { + "epoch": 2.0, + "grad_norm": 0.16677772998809814, + "learning_rate": 4.741368109217072e-06, + "loss": 0.0065, + "step": 152 + }, + { + "epoch": 2.013157894736842, + "grad_norm": 0.14940837025642395, + "learning_rate": 4.7315087539511225e-06, + "loss": 0.0034, + "step": 153 + }, + { + "epoch": 2.026315789473684, + "grad_norm": 0.14960654079914093, + "learning_rate": 4.721475620392567e-06, + "loss": 0.0034, + "step": 154 + }, + { + "epoch": 2.039473684210526, + "grad_norm": 0.2261868566274643, + "learning_rate": 4.711269489869654e-06, + "loss": 0.0034, + "step": 155 + }, + { + "epoch": 2.0526315789473686, + "grad_norm": 0.20907555520534515, + "learning_rate": 4.700891157182729e-06, + "loss": 0.0045, + "step": 156 + }, + { + "epoch": 2.0657894736842106, + "grad_norm": 0.15571005642414093, + "learning_rate": 4.690341430542351e-06, + "loss": 0.0032, + "step": 157 + }, + { + "epoch": 2.0789473684210527, + "grad_norm": 0.16968725621700287, + "learning_rate": 4.679621131506347e-06, + "loss": 0.0044, + "step": 158 + }, + { + "epoch": 2.0921052631578947, + "grad_norm": 0.1937742531299591, + "learning_rate": 4.668731094915835e-06, + "loss": 0.0027, + "step": 159 + }, + { + "epoch": 2.1052631578947367, + "grad_norm": 0.14914007484912872, + "learning_rate": 4.657672168830211e-06, + "loss": 0.0031, + "step": 160 + }, + { + "epoch": 2.1184210526315788, + "grad_norm": 0.19651293754577637, + 
"learning_rate": 4.646445214461105e-06, + "loss": 0.0043, + "step": 161 + }, + { + "epoch": 2.1315789473684212, + "grad_norm": 0.2023143172264099, + "learning_rate": 4.635051106105316e-06, + "loss": 0.0036, + "step": 162 + }, + { + "epoch": 2.1447368421052633, + "grad_norm": 0.17952999472618103, + "learning_rate": 4.623490731076728e-06, + "loss": 0.0024, + "step": 163 + }, + { + "epoch": 2.1578947368421053, + "grad_norm": 0.18410134315490723, + "learning_rate": 4.6117649896372055e-06, + "loss": 0.0054, + "step": 164 + }, + { + "epoch": 2.1710526315789473, + "grad_norm": 0.18808087706565857, + "learning_rate": 4.59987479492649e-06, + "loss": 0.0039, + "step": 165 + }, + { + "epoch": 2.1842105263157894, + "grad_norm": 0.12346187978982925, + "learning_rate": 4.587821072891089e-06, + "loss": 0.0036, + "step": 166 + }, + { + "epoch": 2.1973684210526314, + "grad_norm": 0.140532448887825, + "learning_rate": 4.5756047622121665e-06, + "loss": 0.0028, + "step": 167 + }, + { + "epoch": 2.2105263157894735, + "grad_norm": 0.8201041221618652, + "learning_rate": 4.563226814232444e-06, + "loss": 0.0028, + "step": 168 + }, + { + "epoch": 2.223684210526316, + "grad_norm": 0.26919177174568176, + "learning_rate": 4.550688192882115e-06, + "loss": 0.0032, + "step": 169 + }, + { + "epoch": 2.236842105263158, + "grad_norm": 0.1321878731250763, + "learning_rate": 4.53798987460378e-06, + "loss": 0.0023, + "step": 170 + }, + { + "epoch": 2.25, + "grad_norm": 0.12545251846313477, + "learning_rate": 4.525132848276405e-06, + "loss": 0.0024, + "step": 171 + }, + { + "epoch": 2.263157894736842, + "grad_norm": 0.1377445012331009, + "learning_rate": 4.512118115138315e-06, + "loss": 0.0033, + "step": 172 + }, + { + "epoch": 2.276315789473684, + "grad_norm": 0.10942364484071732, + "learning_rate": 4.498946688709216e-06, + "loss": 0.0023, + "step": 173 + }, + { + "epoch": 2.2894736842105265, + "grad_norm": 0.17425717413425446, + "learning_rate": 4.485619594711278e-06, + "loss": 0.003, + "step": 174 + 
}, + { + "epoch": 2.3026315789473686, + "grad_norm": 0.15876342356204987, + "learning_rate": 4.4721378709892475e-06, + "loss": 0.0034, + "step": 175 + }, + { + "epoch": 2.3157894736842106, + "grad_norm": 0.1537884920835495, + "learning_rate": 4.4585025674296315e-06, + "loss": 0.002, + "step": 176 + }, + { + "epoch": 2.3289473684210527, + "grad_norm": 0.13558532297611237, + "learning_rate": 4.444714745878936e-06, + "loss": 0.0021, + "step": 177 + }, + { + "epoch": 2.3421052631578947, + "grad_norm": 0.14405666291713715, + "learning_rate": 4.430775480060973e-06, + "loss": 0.0028, + "step": 178 + }, + { + "epoch": 2.3552631578947367, + "grad_norm": 0.19296719133853912, + "learning_rate": 4.416685855493246e-06, + "loss": 0.0034, + "step": 179 + }, + { + "epoch": 2.3684210526315788, + "grad_norm": 0.2153935730457306, + "learning_rate": 4.4024469694024194e-06, + "loss": 0.0032, + "step": 180 + }, + { + "epoch": 2.3815789473684212, + "grad_norm": 0.11674188822507858, + "learning_rate": 4.388059930638865e-06, + "loss": 0.0013, + "step": 181 + }, + { + "epoch": 2.3947368421052633, + "grad_norm": 0.21349935233592987, + "learning_rate": 4.373525859590313e-06, + "loss": 0.002, + "step": 182 + }, + { + "epoch": 2.4078947368421053, + "grad_norm": 0.16676126420497894, + "learning_rate": 4.358845888094607e-06, + "loss": 0.0015, + "step": 183 + }, + { + "epoch": 2.4210526315789473, + "grad_norm": 0.20975975692272186, + "learning_rate": 4.3440211593515556e-06, + "loss": 0.0025, + "step": 184 + }, + { + "epoch": 2.4342105263157894, + "grad_norm": 0.3014683425426483, + "learning_rate": 4.32905282783391e-06, + "loss": 0.0031, + "step": 185 + }, + { + "epoch": 2.4473684210526314, + "grad_norm": 0.1687438040971756, + "learning_rate": 4.313942059197457e-06, + "loss": 0.0014, + "step": 186 + }, + { + "epoch": 2.4605263157894735, + "grad_norm": 0.13351179659366608, + "learning_rate": 4.298690030190247e-06, + "loss": 0.0012, + "step": 187 + }, + { + "epoch": 2.473684210526316, + "grad_norm": 
0.4079025387763977, + "learning_rate": 4.283297928560951e-06, + "loss": 0.0026, + "step": 188 + }, + { + "epoch": 2.486842105263158, + "grad_norm": 0.12639036774635315, + "learning_rate": 4.267766952966369e-06, + "loss": 0.0017, + "step": 189 + }, + { + "epoch": 2.5, + "grad_norm": 0.1551010012626648, + "learning_rate": 4.252098312878083e-06, + "loss": 0.0022, + "step": 190 + }, + { + "epoch": 2.513157894736842, + "grad_norm": 0.1431741863489151, + "learning_rate": 4.236293228488267e-06, + "loss": 0.0022, + "step": 191 + }, + { + "epoch": 2.526315789473684, + "grad_norm": 0.39600786566734314, + "learning_rate": 4.220352930614672e-06, + "loss": 0.0031, + "step": 192 + }, + { + "epoch": 2.5394736842105265, + "grad_norm": 0.13951376080513, + "learning_rate": 4.204278660604767e-06, + "loss": 0.0016, + "step": 193 + }, + { + "epoch": 2.5526315789473686, + "grad_norm": 0.10893042385578156, + "learning_rate": 4.1880716702390764e-06, + "loss": 0.0007, + "step": 194 + }, + { + "epoch": 2.5657894736842106, + "grad_norm": 0.16801239550113678, + "learning_rate": 4.171733221633695e-06, + "loss": 0.002, + "step": 195 + }, + { + "epoch": 2.5789473684210527, + "grad_norm": 0.11393369734287262, + "learning_rate": 4.155264587142002e-06, + "loss": 0.0017, + "step": 196 + }, + { + "epoch": 2.5921052631578947, + "grad_norm": 0.23128700256347656, + "learning_rate": 4.138667049255574e-06, + "loss": 0.0018, + "step": 197 + }, + { + "epoch": 2.6052631578947367, + "grad_norm": 0.06730300188064575, + "learning_rate": 4.121941900504316e-06, + "loss": 0.0006, + "step": 198 + }, + { + "epoch": 2.6184210526315788, + "grad_norm": 0.11693810671567917, + "learning_rate": 4.105090443355801e-06, + "loss": 0.0012, + "step": 199 + }, + { + "epoch": 2.6315789473684212, + "grad_norm": 0.1186346486210823, + "learning_rate": 4.088113990113846e-06, + "loss": 0.0012, + "step": 200 + }, + { + "epoch": 2.6447368421052633, + "grad_norm": 0.2584531605243683, + "learning_rate": 4.071013862816311e-06, + "loss": 
0.0025, + "step": 201 + }, + { + "epoch": 2.6578947368421053, + "grad_norm": 0.09868124127388, + "learning_rate": 4.0537913931321495e-06, + "loss": 0.0017, + "step": 202 + }, + { + "epoch": 2.6710526315789473, + "grad_norm": 0.09907737374305725, + "learning_rate": 4.036447922257699e-06, + "loss": 0.002, + "step": 203 + }, + { + "epoch": 2.6842105263157894, + "grad_norm": 0.06743597984313965, + "learning_rate": 4.018984800812248e-06, + "loss": 0.0006, + "step": 204 + }, + { + "epoch": 2.6973684210526314, + "grad_norm": 0.08913715183734894, + "learning_rate": 4.001403388732842e-06, + "loss": 0.0007, + "step": 205 + }, + { + "epoch": 2.7105263157894735, + "grad_norm": 0.12334564328193665, + "learning_rate": 3.983705055168391e-06, + "loss": 0.0006, + "step": 206 + }, + { + "epoch": 2.723684210526316, + "grad_norm": 0.10878646373748779, + "learning_rate": 3.965891178373038e-06, + "loss": 0.0016, + "step": 207 + }, + { + "epoch": 2.736842105263158, + "grad_norm": 0.10623253136873245, + "learning_rate": 3.947963145598833e-06, + "loss": 0.0015, + "step": 208 + }, + { + "epoch": 2.75, + "grad_norm": 0.15580499172210693, + "learning_rate": 3.929922352987702e-06, + "loss": 0.0011, + "step": 209 + }, + { + "epoch": 2.763157894736842, + "grad_norm": 0.06405838578939438, + "learning_rate": 3.911770205462717e-06, + "loss": 0.0007, + "step": 210 + }, + { + "epoch": 2.776315789473684, + "grad_norm": 0.17784689366817474, + "learning_rate": 3.8935081166186935e-06, + "loss": 0.0017, + "step": 211 + }, + { + "epoch": 2.7894736842105265, + "grad_norm": 0.14516866207122803, + "learning_rate": 3.875137508612104e-06, + "loss": 0.0014, + "step": 212 + }, + { + "epoch": 2.8026315789473686, + "grad_norm": 0.09510776400566101, + "learning_rate": 3.856659812050328e-06, + "loss": 0.0009, + "step": 213 + }, + { + "epoch": 2.8157894736842106, + "grad_norm": 0.1000828891992569, + "learning_rate": 3.838076465880248e-06, + "loss": 0.0008, + "step": 214 + }, + { + "epoch": 2.8289473684210527, + 
"grad_norm": 0.10773428529500961, + "learning_rate": 3.819388917276186e-06, + "loss": 0.0008, + "step": 215 + }, + { + "epoch": 2.8421052631578947, + "grad_norm": 0.12319213151931763, + "learning_rate": 3.8005986215272056e-06, + "loss": 0.0007, + "step": 216 + }, + { + "epoch": 2.8552631578947367, + "grad_norm": 0.07209170609712601, + "learning_rate": 3.7817070419237866e-06, + "loss": 0.0006, + "step": 217 + }, + { + "epoch": 2.8684210526315788, + "grad_norm": 0.12889248132705688, + "learning_rate": 3.7627156496438686e-06, + "loss": 0.0005, + "step": 218 + }, + { + "epoch": 2.8815789473684212, + "grad_norm": 0.05019540339708328, + "learning_rate": 3.7436259236382797e-06, + "loss": 0.0003, + "step": 219 + }, + { + "epoch": 2.8947368421052633, + "grad_norm": 0.10657542198896408, + "learning_rate": 3.7244393505155713e-06, + "loss": 0.0008, + "step": 220 + }, + { + "epoch": 2.9078947368421053, + "grad_norm": 0.15984083712100983, + "learning_rate": 3.7051574244262412e-06, + "loss": 0.0003, + "step": 221 + }, + { + "epoch": 2.9210526315789473, + "grad_norm": 0.1567343920469284, + "learning_rate": 3.6857816469463806e-06, + "loss": 0.0005, + "step": 222 + }, + { + "epoch": 2.9342105263157894, + "grad_norm": 0.07294822484254837, + "learning_rate": 3.6663135269607413e-06, + "loss": 0.0006, + "step": 223 + }, + { + "epoch": 2.9473684210526314, + "grad_norm": 0.04486699402332306, + "learning_rate": 3.6467545805452266e-06, + "loss": 0.0003, + "step": 224 + }, + { + "epoch": 2.9605263157894735, + "grad_norm": 0.08188032358884811, + "learning_rate": 3.6271063308488298e-06, + "loss": 0.0004, + "step": 225 + }, + { + "epoch": 2.973684210526316, + "grad_norm": 0.029995013028383255, + "learning_rate": 3.6073703079750204e-06, + "loss": 0.0002, + "step": 226 + }, + { + "epoch": 2.986842105263158, + "grad_norm": 0.029155094176530838, + "learning_rate": 3.5875480488625847e-06, + "loss": 0.0002, + "step": 227 + }, + { + "epoch": 3.0, + "grad_norm": 0.058334361761808395, + "learning_rate": 
3.5676410971659404e-06, + "loss": 0.0003, + "step": 228 + } + ], + "logging_steps": 1, + "max_steps": 456, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 76, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.4667657840936616e+19, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-228/training_args.bin b/checkpoint-228/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..5a1edbdcc63a93daa09112168cf20c0f8fcb7512 --- /dev/null +++ b/checkpoint-228/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:041cfaa5bf8383821dea4fa5a9d2eab2caad4644c4cd651398c8b0ab1541b270 +size 7992 diff --git a/checkpoint-228/zero_to_fp32.py b/checkpoint-228/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/checkpoint-228/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . 
pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise 
FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # 
immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, 
zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + 
full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + 
wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # 
recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. 
+ + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. 
(one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/checkpoint-304/README.md b/checkpoint-304/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fdf619c317c2fe82074662582dbd68166b6f9d50 --- /dev/null +++ b/checkpoint-304/README.md @@ -0,0 +1,202 @@ +--- +base_model: nvidia/Llama-3.1-Nemotron-70B-Instruct-HF +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.0 \ No newline at end of file diff --git a/checkpoint-304/adapter_config.json b/checkpoint-304/adapter_config.json new file mode 100644 index 
0000000000000000000000000000000000000000..a192388a7b55129be9ad9168abc396b47bbda6f7 --- /dev/null +++ b/checkpoint-304/adapter_config.json @@ -0,0 +1,42 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 512, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [ + "embed_tokens", + "lm_head" + ], + "peft_type": "LORA", + "r": 256, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "q_proj", + "o_proj", + "down_proj", + "gate_proj", + "k_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-304/adapter_model.safetensors b/checkpoint-304/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f08224c74d70e03af0a497046bdf0f4516ade3e6 --- /dev/null +++ b/checkpoint-304/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8031acd919098475b574e499e8b25ee9c9ff713cae572ada2607a2143510a4dc +size 10829849744 diff --git a/checkpoint-304/global_step304/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-304/global_step304/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f6061ef3170f17b5a4d43ee12447233d44c6dfbc --- /dev/null +++ b/checkpoint-304/global_step304/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:4b93ac81ce89293290dd1259cd1427457fe7e5aa0594fc5e0a1cbbf474bf3430 +size 21659418140 diff --git a/checkpoint-304/global_step304/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-304/global_step304/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d5791f3527f0413d866ec01a666be3943dafe851 --- /dev/null +++ b/checkpoint-304/global_step304/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:caaec06ddb51fc223fcd558dea83fe474b40007fc598fab2a6e80db629ca3c88 +size 21659457372 diff --git a/checkpoint-304/global_step304/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-304/global_step304/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..799c40374aa2cfb0d2e150d55ff87bff0d0b513a --- /dev/null +++ b/checkpoint-304/global_step304/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65c702199988cf67ae32a4deeec7d4b4e3237119f35f8060bc27598d799fd019 +size 21659417820 diff --git a/checkpoint-304/global_step304/mp_rank_00_model_states.pt b/checkpoint-304/global_step304/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..43e4fd7ad7a3b582cf45b67cabc8f215306cfafb --- /dev/null +++ b/checkpoint-304/global_step304/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e25b7f1973710e0b194b6bc9ed555a9f53755bd98e0d309381a1f6d430b1b2b4 +size 11918643933 diff --git a/checkpoint-304/latest b/checkpoint-304/latest new file mode 100644 index 0000000000000000000000000000000000000000..3761843487f150944adef329837340fd2ed0b7ff --- /dev/null +++ b/checkpoint-304/latest @@ -0,0 +1 @@ +global_step304 \ No newline at end of file diff --git a/checkpoint-304/rng_state_0.pth b/checkpoint-304/rng_state_0.pth new file mode 100644 index 
0000000000000000000000000000000000000000..ff604c8c4474afbed94401dfd5d6c1473f9d3583 --- /dev/null +++ b/checkpoint-304/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99a3d324f952ce2014535fcab16510a458a3013d4a495eadb02ed7fff34e2363 +size 14768 diff --git a/checkpoint-304/rng_state_1.pth b/checkpoint-304/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..ef8d9c6fa11f39be0d41b6080c2fdbc5dcfb7349 --- /dev/null +++ b/checkpoint-304/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8aa11659794f1549b168457979fc560787dd21e97b4f2ad4e52b23c8576c2de +size 14768 diff --git a/checkpoint-304/rng_state_2.pth b/checkpoint-304/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..5626492adca4ab2303b5cbd44ed33ad4523b3c8e --- /dev/null +++ b/checkpoint-304/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6b24784f5720fe4389d0dffe37c832973528a7aafd8842126a0d5a23d49aff4 +size 14768 diff --git a/checkpoint-304/scheduler.pt b/checkpoint-304/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..6b5db44d423bc56da0906bba71e4a2c62d1f77f9 --- /dev/null +++ b/checkpoint-304/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd1ee26d42c6bb20a44e8946dadecd2cff0ca8ab5815e472d6bc7cfc6f35c116 +size 1064 diff --git a/checkpoint-304/special_tokens_map.json b/checkpoint-304/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..278b7f0f84be865c4687700ee7b3c63d89a51e18 --- /dev/null +++ b/checkpoint-304/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + 
"pad_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-304/tokenizer.json b/checkpoint-304/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-304/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-304/tokenizer_config.json b/checkpoint-304/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..fdde94c29816839ec3c29d6c6461206a49911f3c --- /dev/null +++ b/checkpoint-304/tokenizer_config.json @@ -0,0 +1,2064 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": 
"<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": 
"<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": 
"<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": 
"<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": 
"<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + 
"extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|end_of_text|>", + "tokenizer_class": "PreTrainedTokenizer" +} diff --git a/checkpoint-304/trainer_state.json b/checkpoint-304/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f78b4cde269ad3aa5e673cf5b83eae1ffd978c6e --- /dev/null +++ b/checkpoint-304/trainer_state.json @@ -0,0 +1,2161 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.0, + "eval_steps": 500, + "global_step": 304, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.013157894736842105, + "grad_norm": 34.99433898925781, + "learning_rate": 5.0000000000000004e-08, + "loss": 2.595, + "step": 1 + }, + { + "epoch": 0.02631578947368421, + "grad_norm": 35.6848258972168, + "learning_rate": 1.0000000000000001e-07, + "loss": 2.6447, + "step": 2 + }, + { + "epoch": 0.039473684210526314, + "grad_norm": 35.07997512817383, + "learning_rate": 1.5000000000000002e-07, + "loss": 2.5819, + "step": 3 + }, + { + "epoch": 0.05263157894736842, + "grad_norm": 34.3863525390625, + "learning_rate": 2.0000000000000002e-07, + "loss": 2.5739, + "step": 4 + }, + { + "epoch": 0.06578947368421052, + "grad_norm": 35.443077087402344, + "learning_rate": 2.5000000000000004e-07, + "loss": 2.6071, + "step": 5 + }, + { + "epoch": 0.07894736842105263, + "grad_norm": 34.70173263549805, + "learning_rate": 3.0000000000000004e-07, + "loss": 2.5487, + "step": 6 + }, + { + "epoch": 0.09210526315789473, + "grad_norm": 34.421295166015625, + "learning_rate": 3.5000000000000004e-07, + "loss": 2.5494, + "step": 7 + }, + { + "epoch": 0.10526315789473684, + "grad_norm": 35.152748107910156, + "learning_rate": 4.0000000000000003e-07, + "loss": 2.5936, + "step": 8 + }, + { + "epoch": 0.11842105263157894, + "grad_norm": 34.947021484375, + "learning_rate": 
4.5000000000000003e-07, + "loss": 2.5574, + "step": 9 + }, + { + "epoch": 0.13157894736842105, + "grad_norm": 34.67315673828125, + "learning_rate": 5.000000000000001e-07, + "loss": 2.4894, + "step": 10 + }, + { + "epoch": 0.14473684210526316, + "grad_norm": 34.679954528808594, + "learning_rate": 5.5e-07, + "loss": 2.4985, + "step": 11 + }, + { + "epoch": 0.15789473684210525, + "grad_norm": 33.57002258300781, + "learning_rate": 6.000000000000001e-07, + "loss": 2.4339, + "step": 12 + }, + { + "epoch": 0.17105263157894737, + "grad_norm": 33.517276763916016, + "learning_rate": 6.5e-07, + "loss": 2.4055, + "step": 13 + }, + { + "epoch": 0.18421052631578946, + "grad_norm": 33.5312385559082, + "learning_rate": 7.000000000000001e-07, + "loss": 2.3806, + "step": 14 + }, + { + "epoch": 0.19736842105263158, + "grad_norm": 32.01276779174805, + "learning_rate": 7.5e-07, + "loss": 2.2505, + "step": 15 + }, + { + "epoch": 0.21052631578947367, + "grad_norm": 31.827980041503906, + "learning_rate": 8.000000000000001e-07, + "loss": 2.1359, + "step": 16 + }, + { + "epoch": 0.2236842105263158, + "grad_norm": 31.437101364135742, + "learning_rate": 8.500000000000001e-07, + "loss": 2.1117, + "step": 17 + }, + { + "epoch": 0.23684210526315788, + "grad_norm": 30.315187454223633, + "learning_rate": 9.000000000000001e-07, + "loss": 1.9795, + "step": 18 + }, + { + "epoch": 0.25, + "grad_norm": 29.622655868530273, + "learning_rate": 9.500000000000001e-07, + "loss": 1.8472, + "step": 19 + }, + { + "epoch": 0.2631578947368421, + "grad_norm": 28.628408432006836, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.7283, + "step": 20 + }, + { + "epoch": 0.27631578947368424, + "grad_norm": 27.83180046081543, + "learning_rate": 1.0500000000000001e-06, + "loss": 1.5942, + "step": 21 + }, + { + "epoch": 0.2894736842105263, + "grad_norm": 26.911596298217773, + "learning_rate": 1.1e-06, + "loss": 1.4467, + "step": 22 + }, + { + "epoch": 0.3026315789473684, + "grad_norm": 25.88102149963379, + 
"learning_rate": 1.1500000000000002e-06, + "loss": 1.3007, + "step": 23 + }, + { + "epoch": 0.3157894736842105, + "grad_norm": 25.146381378173828, + "learning_rate": 1.2000000000000002e-06, + "loss": 1.1319, + "step": 24 + }, + { + "epoch": 0.32894736842105265, + "grad_norm": 24.800382614135742, + "learning_rate": 1.25e-06, + "loss": 0.9359, + "step": 25 + }, + { + "epoch": 0.34210526315789475, + "grad_norm": 24.648332595825195, + "learning_rate": 1.3e-06, + "loss": 0.7054, + "step": 26 + }, + { + "epoch": 0.35526315789473684, + "grad_norm": 22.947620391845703, + "learning_rate": 1.3500000000000002e-06, + "loss": 0.5209, + "step": 27 + }, + { + "epoch": 0.3684210526315789, + "grad_norm": 17.80010414123535, + "learning_rate": 1.4000000000000001e-06, + "loss": 0.3546, + "step": 28 + }, + { + "epoch": 0.3815789473684211, + "grad_norm": 11.841789245605469, + "learning_rate": 1.45e-06, + "loss": 0.26, + "step": 29 + }, + { + "epoch": 0.39473684210526316, + "grad_norm": 7.275839805603027, + "learning_rate": 1.5e-06, + "loss": 0.1808, + "step": 30 + }, + { + "epoch": 0.40789473684210525, + "grad_norm": 4.6324543952941895, + "learning_rate": 1.5500000000000002e-06, + "loss": 0.1464, + "step": 31 + }, + { + "epoch": 0.42105263157894735, + "grad_norm": 3.1281485557556152, + "learning_rate": 1.6000000000000001e-06, + "loss": 0.1079, + "step": 32 + }, + { + "epoch": 0.4342105263157895, + "grad_norm": 2.062562942504883, + "learning_rate": 1.6500000000000003e-06, + "loss": 0.0966, + "step": 33 + }, + { + "epoch": 0.4473684210526316, + "grad_norm": 2.1343328952789307, + "learning_rate": 1.7000000000000002e-06, + "loss": 0.088, + "step": 34 + }, + { + "epoch": 0.4605263157894737, + "grad_norm": 1.6768524646759033, + "learning_rate": 1.75e-06, + "loss": 0.0783, + "step": 35 + }, + { + "epoch": 0.47368421052631576, + "grad_norm": 1.0879229307174683, + "learning_rate": 1.8000000000000001e-06, + "loss": 0.0623, + "step": 36 + }, + { + "epoch": 0.4868421052631579, + "grad_norm": 
0.83177649974823, + "learning_rate": 1.85e-06, + "loss": 0.0655, + "step": 37 + }, + { + "epoch": 0.5, + "grad_norm": 0.5678385496139526, + "learning_rate": 1.9000000000000002e-06, + "loss": 0.0565, + "step": 38 + }, + { + "epoch": 0.5131578947368421, + "grad_norm": 0.6994458436965942, + "learning_rate": 1.9500000000000004e-06, + "loss": 0.0491, + "step": 39 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 0.711387038230896, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.0507, + "step": 40 + }, + { + "epoch": 0.5394736842105263, + "grad_norm": 0.7169735431671143, + "learning_rate": 2.05e-06, + "loss": 0.0478, + "step": 41 + }, + { + "epoch": 0.5526315789473685, + "grad_norm": 0.603631317615509, + "learning_rate": 2.1000000000000002e-06, + "loss": 0.0507, + "step": 42 + }, + { + "epoch": 0.5657894736842105, + "grad_norm": 0.617487907409668, + "learning_rate": 2.15e-06, + "loss": 0.043, + "step": 43 + }, + { + "epoch": 0.5789473684210527, + "grad_norm": 0.4638065993785858, + "learning_rate": 2.2e-06, + "loss": 0.0472, + "step": 44 + }, + { + "epoch": 0.5921052631578947, + "grad_norm": 0.5996385216712952, + "learning_rate": 2.25e-06, + "loss": 0.0429, + "step": 45 + }, + { + "epoch": 0.6052631578947368, + "grad_norm": 0.39118286967277527, + "learning_rate": 2.3000000000000004e-06, + "loss": 0.0421, + "step": 46 + }, + { + "epoch": 0.618421052631579, + "grad_norm": 0.3118075728416443, + "learning_rate": 2.35e-06, + "loss": 0.0383, + "step": 47 + }, + { + "epoch": 0.631578947368421, + "grad_norm": 0.31731992959976196, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.041, + "step": 48 + }, + { + "epoch": 0.6447368421052632, + "grad_norm": 0.5413194298744202, + "learning_rate": 2.4500000000000003e-06, + "loss": 0.0397, + "step": 49 + }, + { + "epoch": 0.6578947368421053, + "grad_norm": 0.32958006858825684, + "learning_rate": 2.5e-06, + "loss": 0.0355, + "step": 50 + }, + { + "epoch": 0.6710526315789473, + "grad_norm": 0.596309244632721, + 
"learning_rate": 2.55e-06, + "loss": 0.0413, + "step": 51 + }, + { + "epoch": 0.6842105263157895, + "grad_norm": 0.4557362496852875, + "learning_rate": 2.6e-06, + "loss": 0.0461, + "step": 52 + }, + { + "epoch": 0.6973684210526315, + "grad_norm": 0.3345410227775574, + "learning_rate": 2.6500000000000005e-06, + "loss": 0.0385, + "step": 53 + }, + { + "epoch": 0.7105263157894737, + "grad_norm": 0.3047848343849182, + "learning_rate": 2.7000000000000004e-06, + "loss": 0.0383, + "step": 54 + }, + { + "epoch": 0.7236842105263158, + "grad_norm": 0.43763449788093567, + "learning_rate": 2.7500000000000004e-06, + "loss": 0.038, + "step": 55 + }, + { + "epoch": 0.7368421052631579, + "grad_norm": 0.26870036125183105, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.0374, + "step": 56 + }, + { + "epoch": 0.75, + "grad_norm": 0.38762542605400085, + "learning_rate": 2.85e-06, + "loss": 0.0349, + "step": 57 + }, + { + "epoch": 0.7631578947368421, + "grad_norm": 0.27517396211624146, + "learning_rate": 2.9e-06, + "loss": 0.0398, + "step": 58 + }, + { + "epoch": 0.7763157894736842, + "grad_norm": 0.30815261602401733, + "learning_rate": 2.95e-06, + "loss": 0.0364, + "step": 59 + }, + { + "epoch": 0.7894736842105263, + "grad_norm": 0.30011361837387085, + "learning_rate": 3e-06, + "loss": 0.0307, + "step": 60 + }, + { + "epoch": 0.8026315789473685, + "grad_norm": 0.3269154727458954, + "learning_rate": 3.05e-06, + "loss": 0.0344, + "step": 61 + }, + { + "epoch": 0.8157894736842105, + "grad_norm": 0.3750869333744049, + "learning_rate": 3.1000000000000004e-06, + "loss": 0.0339, + "step": 62 + }, + { + "epoch": 0.8289473684210527, + "grad_norm": 0.29285815358161926, + "learning_rate": 3.1500000000000003e-06, + "loss": 0.034, + "step": 63 + }, + { + "epoch": 0.8421052631578947, + "grad_norm": 0.4157550632953644, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.0348, + "step": 64 + }, + { + "epoch": 0.8552631578947368, + "grad_norm": 0.2852867543697357, + "learning_rate": 
3.2500000000000002e-06, + "loss": 0.0319, + "step": 65 + }, + { + "epoch": 0.868421052631579, + "grad_norm": 0.4384031593799591, + "learning_rate": 3.3000000000000006e-06, + "loss": 0.0319, + "step": 66 + }, + { + "epoch": 0.881578947368421, + "grad_norm": 0.4003254771232605, + "learning_rate": 3.3500000000000005e-06, + "loss": 0.0347, + "step": 67 + }, + { + "epoch": 0.8947368421052632, + "grad_norm": 0.49913832545280457, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.0347, + "step": 68 + }, + { + "epoch": 0.9078947368421053, + "grad_norm": 0.22642269730567932, + "learning_rate": 3.45e-06, + "loss": 0.0306, + "step": 69 + }, + { + "epoch": 0.9210526315789473, + "grad_norm": 0.34004101157188416, + "learning_rate": 3.5e-06, + "loss": 0.0337, + "step": 70 + }, + { + "epoch": 0.9342105263157895, + "grad_norm": 0.21503636240959167, + "learning_rate": 3.5500000000000003e-06, + "loss": 0.0311, + "step": 71 + }, + { + "epoch": 0.9473684210526315, + "grad_norm": 0.33802086114883423, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.0293, + "step": 72 + }, + { + "epoch": 0.9605263157894737, + "grad_norm": 0.2488064169883728, + "learning_rate": 3.65e-06, + "loss": 0.0318, + "step": 73 + }, + { + "epoch": 0.9736842105263158, + "grad_norm": 0.21124528348445892, + "learning_rate": 3.7e-06, + "loss": 0.0293, + "step": 74 + }, + { + "epoch": 0.9868421052631579, + "grad_norm": 0.3108712136745453, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.0288, + "step": 75 + }, + { + "epoch": 1.0, + "grad_norm": 0.33483418822288513, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.031, + "step": 76 + }, + { + "epoch": 1.013157894736842, + "grad_norm": 0.3099130690097809, + "learning_rate": 3.85e-06, + "loss": 0.0286, + "step": 77 + }, + { + "epoch": 1.0263157894736843, + "grad_norm": 0.22946476936340332, + "learning_rate": 3.900000000000001e-06, + "loss": 0.0307, + "step": 78 + }, + { + "epoch": 1.0394736842105263, + "grad_norm": 0.36924120783805847, + 
"learning_rate": 3.95e-06, + "loss": 0.0274, + "step": 79 + }, + { + "epoch": 1.0526315789473684, + "grad_norm": 0.30895617604255676, + "learning_rate": 4.000000000000001e-06, + "loss": 0.0274, + "step": 80 + }, + { + "epoch": 1.0657894736842106, + "grad_norm": 0.42033568024635315, + "learning_rate": 4.05e-06, + "loss": 0.0298, + "step": 81 + }, + { + "epoch": 1.0789473684210527, + "grad_norm": 0.35573887825012207, + "learning_rate": 4.1e-06, + "loss": 0.0286, + "step": 82 + }, + { + "epoch": 1.0921052631578947, + "grad_norm": 0.24631913006305695, + "learning_rate": 4.15e-06, + "loss": 0.0294, + "step": 83 + }, + { + "epoch": 1.1052631578947367, + "grad_norm": 0.2908592224121094, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.0243, + "step": 84 + }, + { + "epoch": 1.118421052631579, + "grad_norm": 0.3293064832687378, + "learning_rate": 4.25e-06, + "loss": 0.0253, + "step": 85 + }, + { + "epoch": 1.131578947368421, + "grad_norm": 0.3789626359939575, + "learning_rate": 4.3e-06, + "loss": 0.0253, + "step": 86 + }, + { + "epoch": 1.1447368421052633, + "grad_norm": 0.3900983929634094, + "learning_rate": 4.350000000000001e-06, + "loss": 0.0248, + "step": 87 + }, + { + "epoch": 1.1578947368421053, + "grad_norm": 0.28972727060317993, + "learning_rate": 4.4e-06, + "loss": 0.0256, + "step": 88 + }, + { + "epoch": 1.1710526315789473, + "grad_norm": 0.4615432620048523, + "learning_rate": 4.450000000000001e-06, + "loss": 0.0259, + "step": 89 + }, + { + "epoch": 1.1842105263157894, + "grad_norm": 0.3959222137928009, + "learning_rate": 4.5e-06, + "loss": 0.0277, + "step": 90 + }, + { + "epoch": 1.1973684210526316, + "grad_norm": 0.4927828907966614, + "learning_rate": 4.5500000000000005e-06, + "loss": 0.0251, + "step": 91 + }, + { + "epoch": 1.2105263157894737, + "grad_norm": 0.23854510486125946, + "learning_rate": 4.600000000000001e-06, + "loss": 0.0214, + "step": 92 + }, + { + "epoch": 1.2236842105263157, + "grad_norm": 0.2470882534980774, + "learning_rate": 4.65e-06, + 
"loss": 0.0255, + "step": 93 + }, + { + "epoch": 1.236842105263158, + "grad_norm": 0.22575952112674713, + "learning_rate": 4.7e-06, + "loss": 0.0208, + "step": 94 + }, + { + "epoch": 1.25, + "grad_norm": 0.437495619058609, + "learning_rate": 4.75e-06, + "loss": 0.0234, + "step": 95 + }, + { + "epoch": 1.263157894736842, + "grad_norm": 0.2712303102016449, + "learning_rate": 4.800000000000001e-06, + "loss": 0.0236, + "step": 96 + }, + { + "epoch": 1.2763157894736843, + "grad_norm": 0.2843461334705353, + "learning_rate": 4.85e-06, + "loss": 0.0195, + "step": 97 + }, + { + "epoch": 1.2894736842105263, + "grad_norm": 0.21141311526298523, + "learning_rate": 4.9000000000000005e-06, + "loss": 0.0223, + "step": 98 + }, + { + "epoch": 1.3026315789473684, + "grad_norm": 0.25484079122543335, + "learning_rate": 4.95e-06, + "loss": 0.0211, + "step": 99 + }, + { + "epoch": 1.3157894736842106, + "grad_norm": 0.358674556016922, + "learning_rate": 5e-06, + "loss": 0.027, + "step": 100 + }, + { + "epoch": 1.3289473684210527, + "grad_norm": 0.20442990958690643, + "learning_rate": 4.999902656502973e-06, + "loss": 0.0234, + "step": 101 + }, + { + "epoch": 1.3421052631578947, + "grad_norm": 0.2281407117843628, + "learning_rate": 4.9996106335924965e-06, + "loss": 0.0243, + "step": 102 + }, + { + "epoch": 1.3552631578947367, + "grad_norm": 0.23803724348545074, + "learning_rate": 4.999123954009797e-06, + "loss": 0.0189, + "step": 103 + }, + { + "epoch": 1.368421052631579, + "grad_norm": 0.21493025124073029, + "learning_rate": 4.998442655654946e-06, + "loss": 0.0207, + "step": 104 + }, + { + "epoch": 1.381578947368421, + "grad_norm": 0.2565159797668457, + "learning_rate": 4.997566791583916e-06, + "loss": 0.0178, + "step": 105 + }, + { + "epoch": 1.3947368421052633, + "grad_norm": 0.3488551378250122, + "learning_rate": 4.996496430004446e-06, + "loss": 0.0226, + "step": 106 + }, + { + "epoch": 1.4078947368421053, + "grad_norm": 0.27695611119270325, + "learning_rate": 4.995231654270726e-06, + 
"loss": 0.0189, + "step": 107 + }, + { + "epoch": 1.4210526315789473, + "grad_norm": 0.23477056622505188, + "learning_rate": 4.993772562876909e-06, + "loss": 0.0182, + "step": 108 + }, + { + "epoch": 1.4342105263157894, + "grad_norm": 0.22611404955387115, + "learning_rate": 4.992119269449445e-06, + "loss": 0.0168, + "step": 109 + }, + { + "epoch": 1.4473684210526316, + "grad_norm": 0.25616368651390076, + "learning_rate": 4.990271902738223e-06, + "loss": 0.022, + "step": 110 + }, + { + "epoch": 1.4605263157894737, + "grad_norm": 0.23842717707157135, + "learning_rate": 4.988230606606552e-06, + "loss": 0.0163, + "step": 111 + }, + { + "epoch": 1.4736842105263157, + "grad_norm": 0.24285905063152313, + "learning_rate": 4.985995540019956e-06, + "loss": 0.0202, + "step": 112 + }, + { + "epoch": 1.486842105263158, + "grad_norm": 0.24602730572223663, + "learning_rate": 4.983566877033791e-06, + "loss": 0.0173, + "step": 113 + }, + { + "epoch": 1.5, + "grad_norm": 0.26218464970588684, + "learning_rate": 4.980944806779698e-06, + "loss": 0.0206, + "step": 114 + }, + { + "epoch": 1.513157894736842, + "grad_norm": 0.2999787926673889, + "learning_rate": 4.9781295334508664e-06, + "loss": 0.0178, + "step": 115 + }, + { + "epoch": 1.526315789473684, + "grad_norm": 0.20500704646110535, + "learning_rate": 4.975121276286136e-06, + "loss": 0.0181, + "step": 116 + }, + { + "epoch": 1.5394736842105263, + "grad_norm": 0.25106561183929443, + "learning_rate": 4.9719202695529265e-06, + "loss": 0.0128, + "step": 117 + }, + { + "epoch": 1.5526315789473686, + "grad_norm": 0.2686936855316162, + "learning_rate": 4.968526762528988e-06, + "loss": 0.0146, + "step": 118 + }, + { + "epoch": 1.5657894736842106, + "grad_norm": 0.2770400047302246, + "learning_rate": 4.964941019482995e-06, + "loss": 0.0167, + "step": 119 + }, + { + "epoch": 1.5789473684210527, + "grad_norm": 0.27510589361190796, + "learning_rate": 4.961163319653959e-06, + "loss": 0.0162, + "step": 120 + }, + { + "epoch": 1.5921052631578947, 
+ "grad_norm": 0.3720133602619171, + "learning_rate": 4.9571939572294914e-06, + "loss": 0.0163, + "step": 121 + }, + { + "epoch": 1.6052631578947367, + "grad_norm": 0.2288741022348404, + "learning_rate": 4.953033241322887e-06, + "loss": 0.0133, + "step": 122 + }, + { + "epoch": 1.618421052631579, + "grad_norm": 0.31084850430488586, + "learning_rate": 4.948681495949055e-06, + "loss": 0.0124, + "step": 123 + }, + { + "epoch": 1.631578947368421, + "grad_norm": 0.19490985572338104, + "learning_rate": 4.944139059999286e-06, + "loss": 0.0114, + "step": 124 + }, + { + "epoch": 1.6447368421052633, + "grad_norm": 0.3074445426464081, + "learning_rate": 4.939406287214861e-06, + "loss": 0.0153, + "step": 125 + }, + { + "epoch": 1.6578947368421053, + "grad_norm": 0.29279908537864685, + "learning_rate": 4.9344835461595016e-06, + "loss": 0.0117, + "step": 126 + }, + { + "epoch": 1.6710526315789473, + "grad_norm": 0.3299407362937927, + "learning_rate": 4.929371220190671e-06, + "loss": 0.0128, + "step": 127 + }, + { + "epoch": 1.6842105263157894, + "grad_norm": 0.24818794429302216, + "learning_rate": 4.9240697074297205e-06, + "loss": 0.0146, + "step": 128 + }, + { + "epoch": 1.6973684210526314, + "grad_norm": 0.35983219742774963, + "learning_rate": 4.918579420730884e-06, + "loss": 0.0138, + "step": 129 + }, + { + "epoch": 1.7105263157894737, + "grad_norm": 0.2583932876586914, + "learning_rate": 4.912900787649124e-06, + "loss": 0.0136, + "step": 130 + }, + { + "epoch": 1.723684210526316, + "grad_norm": 0.20754319429397583, + "learning_rate": 4.907034250406846e-06, + "loss": 0.0116, + "step": 131 + }, + { + "epoch": 1.736842105263158, + "grad_norm": 0.30609601736068726, + "learning_rate": 4.900980265859449e-06, + "loss": 0.0111, + "step": 132 + }, + { + "epoch": 1.75, + "grad_norm": 0.3754304349422455, + "learning_rate": 4.894739305459754e-06, + "loss": 0.0126, + "step": 133 + }, + { + "epoch": 1.763157894736842, + "grad_norm": 0.2517055571079254, + "learning_rate": 
4.88831185522129e-06, + "loss": 0.0118, + "step": 134 + }, + { + "epoch": 1.776315789473684, + "grad_norm": 0.198478102684021, + "learning_rate": 4.881698415680442e-06, + "loss": 0.0087, + "step": 135 + }, + { + "epoch": 1.7894736842105263, + "grad_norm": 0.2307695895433426, + "learning_rate": 4.874899501857477e-06, + "loss": 0.0094, + "step": 136 + }, + { + "epoch": 1.8026315789473686, + "grad_norm": 0.17823486030101776, + "learning_rate": 4.867915643216434e-06, + "loss": 0.0098, + "step": 137 + }, + { + "epoch": 1.8157894736842106, + "grad_norm": 0.2157433032989502, + "learning_rate": 4.860747383623889e-06, + "loss": 0.0114, + "step": 138 + }, + { + "epoch": 1.8289473684210527, + "grad_norm": 0.21051311492919922, + "learning_rate": 4.85339528130661e-06, + "loss": 0.011, + "step": 139 + }, + { + "epoch": 1.8421052631578947, + "grad_norm": 0.32886555790901184, + "learning_rate": 4.845859908808074e-06, + "loss": 0.011, + "step": 140 + }, + { + "epoch": 1.8552631578947367, + "grad_norm": 0.22413378953933716, + "learning_rate": 4.838141852943891e-06, + "loss": 0.0087, + "step": 141 + }, + { + "epoch": 1.868421052631579, + "grad_norm": 0.2896019518375397, + "learning_rate": 4.830241714756099e-06, + "loss": 0.011, + "step": 142 + }, + { + "epoch": 1.881578947368421, + "grad_norm": 0.26163023710250854, + "learning_rate": 4.822160109466361e-06, + "loss": 0.0084, + "step": 143 + }, + { + "epoch": 1.8947368421052633, + "grad_norm": 0.23998413980007172, + "learning_rate": 4.813897666428054e-06, + "loss": 0.0094, + "step": 144 + }, + { + "epoch": 1.9078947368421053, + "grad_norm": 0.2334728091955185, + "learning_rate": 4.805455029077255e-06, + "loss": 0.007, + "step": 145 + }, + { + "epoch": 1.9210526315789473, + "grad_norm": 0.17431940138339996, + "learning_rate": 4.79683285488264e-06, + "loss": 0.0047, + "step": 146 + }, + { + "epoch": 1.9342105263157894, + "grad_norm": 0.19151932001113892, + "learning_rate": 4.788031815294282e-06, + "loss": 0.0056, + "step": 147 + }, + { + 
"epoch": 1.9473684210526314, + "grad_norm": 0.2352588027715683, + "learning_rate": 4.779052595691355e-06, + "loss": 0.0107, + "step": 148 + }, + { + "epoch": 1.9605263157894737, + "grad_norm": 0.2848915159702301, + "learning_rate": 4.76989589532877e-06, + "loss": 0.0074, + "step": 149 + }, + { + "epoch": 1.973684210526316, + "grad_norm": 0.218011736869812, + "learning_rate": 4.7605624272827125e-06, + "loss": 0.0075, + "step": 150 + }, + { + "epoch": 1.986842105263158, + "grad_norm": 0.3043143153190613, + "learning_rate": 4.75105291839512e-06, + "loss": 0.0073, + "step": 151 + }, + { + "epoch": 2.0, + "grad_norm": 0.16677772998809814, + "learning_rate": 4.741368109217072e-06, + "loss": 0.0065, + "step": 152 + }, + { + "epoch": 2.013157894736842, + "grad_norm": 0.14940837025642395, + "learning_rate": 4.7315087539511225e-06, + "loss": 0.0034, + "step": 153 + }, + { + "epoch": 2.026315789473684, + "grad_norm": 0.14960654079914093, + "learning_rate": 4.721475620392567e-06, + "loss": 0.0034, + "step": 154 + }, + { + "epoch": 2.039473684210526, + "grad_norm": 0.2261868566274643, + "learning_rate": 4.711269489869654e-06, + "loss": 0.0034, + "step": 155 + }, + { + "epoch": 2.0526315789473686, + "grad_norm": 0.20907555520534515, + "learning_rate": 4.700891157182729e-06, + "loss": 0.0045, + "step": 156 + }, + { + "epoch": 2.0657894736842106, + "grad_norm": 0.15571005642414093, + "learning_rate": 4.690341430542351e-06, + "loss": 0.0032, + "step": 157 + }, + { + "epoch": 2.0789473684210527, + "grad_norm": 0.16968725621700287, + "learning_rate": 4.679621131506347e-06, + "loss": 0.0044, + "step": 158 + }, + { + "epoch": 2.0921052631578947, + "grad_norm": 0.1937742531299591, + "learning_rate": 4.668731094915835e-06, + "loss": 0.0027, + "step": 159 + }, + { + "epoch": 2.1052631578947367, + "grad_norm": 0.14914007484912872, + "learning_rate": 4.657672168830211e-06, + "loss": 0.0031, + "step": 160 + }, + { + "epoch": 2.1184210526315788, + "grad_norm": 0.19651293754577637, + 
"learning_rate": 4.646445214461105e-06, + "loss": 0.0043, + "step": 161 + }, + { + "epoch": 2.1315789473684212, + "grad_norm": 0.2023143172264099, + "learning_rate": 4.635051106105316e-06, + "loss": 0.0036, + "step": 162 + }, + { + "epoch": 2.1447368421052633, + "grad_norm": 0.17952999472618103, + "learning_rate": 4.623490731076728e-06, + "loss": 0.0024, + "step": 163 + }, + { + "epoch": 2.1578947368421053, + "grad_norm": 0.18410134315490723, + "learning_rate": 4.6117649896372055e-06, + "loss": 0.0054, + "step": 164 + }, + { + "epoch": 2.1710526315789473, + "grad_norm": 0.18808087706565857, + "learning_rate": 4.59987479492649e-06, + "loss": 0.0039, + "step": 165 + }, + { + "epoch": 2.1842105263157894, + "grad_norm": 0.12346187978982925, + "learning_rate": 4.587821072891089e-06, + "loss": 0.0036, + "step": 166 + }, + { + "epoch": 2.1973684210526314, + "grad_norm": 0.140532448887825, + "learning_rate": 4.5756047622121665e-06, + "loss": 0.0028, + "step": 167 + }, + { + "epoch": 2.2105263157894735, + "grad_norm": 0.8201041221618652, + "learning_rate": 4.563226814232444e-06, + "loss": 0.0028, + "step": 168 + }, + { + "epoch": 2.223684210526316, + "grad_norm": 0.26919177174568176, + "learning_rate": 4.550688192882115e-06, + "loss": 0.0032, + "step": 169 + }, + { + "epoch": 2.236842105263158, + "grad_norm": 0.1321878731250763, + "learning_rate": 4.53798987460378e-06, + "loss": 0.0023, + "step": 170 + }, + { + "epoch": 2.25, + "grad_norm": 0.12545251846313477, + "learning_rate": 4.525132848276405e-06, + "loss": 0.0024, + "step": 171 + }, + { + "epoch": 2.263157894736842, + "grad_norm": 0.1377445012331009, + "learning_rate": 4.512118115138315e-06, + "loss": 0.0033, + "step": 172 + }, + { + "epoch": 2.276315789473684, + "grad_norm": 0.10942364484071732, + "learning_rate": 4.498946688709216e-06, + "loss": 0.0023, + "step": 173 + }, + { + "epoch": 2.2894736842105265, + "grad_norm": 0.17425717413425446, + "learning_rate": 4.485619594711278e-06, + "loss": 0.003, + "step": 174 + 
}, + { + "epoch": 2.3026315789473686, + "grad_norm": 0.15876342356204987, + "learning_rate": 4.4721378709892475e-06, + "loss": 0.0034, + "step": 175 + }, + { + "epoch": 2.3157894736842106, + "grad_norm": 0.1537884920835495, + "learning_rate": 4.4585025674296315e-06, + "loss": 0.002, + "step": 176 + }, + { + "epoch": 2.3289473684210527, + "grad_norm": 0.13558532297611237, + "learning_rate": 4.444714745878936e-06, + "loss": 0.0021, + "step": 177 + }, + { + "epoch": 2.3421052631578947, + "grad_norm": 0.14405666291713715, + "learning_rate": 4.430775480060973e-06, + "loss": 0.0028, + "step": 178 + }, + { + "epoch": 2.3552631578947367, + "grad_norm": 0.19296719133853912, + "learning_rate": 4.416685855493246e-06, + "loss": 0.0034, + "step": 179 + }, + { + "epoch": 2.3684210526315788, + "grad_norm": 0.2153935730457306, + "learning_rate": 4.4024469694024194e-06, + "loss": 0.0032, + "step": 180 + }, + { + "epoch": 2.3815789473684212, + "grad_norm": 0.11674188822507858, + "learning_rate": 4.388059930638865e-06, + "loss": 0.0013, + "step": 181 + }, + { + "epoch": 2.3947368421052633, + "grad_norm": 0.21349935233592987, + "learning_rate": 4.373525859590313e-06, + "loss": 0.002, + "step": 182 + }, + { + "epoch": 2.4078947368421053, + "grad_norm": 0.16676126420497894, + "learning_rate": 4.358845888094607e-06, + "loss": 0.0015, + "step": 183 + }, + { + "epoch": 2.4210526315789473, + "grad_norm": 0.20975975692272186, + "learning_rate": 4.3440211593515556e-06, + "loss": 0.0025, + "step": 184 + }, + { + "epoch": 2.4342105263157894, + "grad_norm": 0.3014683425426483, + "learning_rate": 4.32905282783391e-06, + "loss": 0.0031, + "step": 185 + }, + { + "epoch": 2.4473684210526314, + "grad_norm": 0.1687438040971756, + "learning_rate": 4.313942059197457e-06, + "loss": 0.0014, + "step": 186 + }, + { + "epoch": 2.4605263157894735, + "grad_norm": 0.13351179659366608, + "learning_rate": 4.298690030190247e-06, + "loss": 0.0012, + "step": 187 + }, + { + "epoch": 2.473684210526316, + "grad_norm": 
0.4079025387763977, + "learning_rate": 4.283297928560951e-06, + "loss": 0.0026, + "step": 188 + }, + { + "epoch": 2.486842105263158, + "grad_norm": 0.12639036774635315, + "learning_rate": 4.267766952966369e-06, + "loss": 0.0017, + "step": 189 + }, + { + "epoch": 2.5, + "grad_norm": 0.1551010012626648, + "learning_rate": 4.252098312878083e-06, + "loss": 0.0022, + "step": 190 + }, + { + "epoch": 2.513157894736842, + "grad_norm": 0.1431741863489151, + "learning_rate": 4.236293228488267e-06, + "loss": 0.0022, + "step": 191 + }, + { + "epoch": 2.526315789473684, + "grad_norm": 0.39600786566734314, + "learning_rate": 4.220352930614672e-06, + "loss": 0.0031, + "step": 192 + }, + { + "epoch": 2.5394736842105265, + "grad_norm": 0.13951376080513, + "learning_rate": 4.204278660604767e-06, + "loss": 0.0016, + "step": 193 + }, + { + "epoch": 2.5526315789473686, + "grad_norm": 0.10893042385578156, + "learning_rate": 4.1880716702390764e-06, + "loss": 0.0007, + "step": 194 + }, + { + "epoch": 2.5657894736842106, + "grad_norm": 0.16801239550113678, + "learning_rate": 4.171733221633695e-06, + "loss": 0.002, + "step": 195 + }, + { + "epoch": 2.5789473684210527, + "grad_norm": 0.11393369734287262, + "learning_rate": 4.155264587142002e-06, + "loss": 0.0017, + "step": 196 + }, + { + "epoch": 2.5921052631578947, + "grad_norm": 0.23128700256347656, + "learning_rate": 4.138667049255574e-06, + "loss": 0.0018, + "step": 197 + }, + { + "epoch": 2.6052631578947367, + "grad_norm": 0.06730300188064575, + "learning_rate": 4.121941900504316e-06, + "loss": 0.0006, + "step": 198 + }, + { + "epoch": 2.6184210526315788, + "grad_norm": 0.11693810671567917, + "learning_rate": 4.105090443355801e-06, + "loss": 0.0012, + "step": 199 + }, + { + "epoch": 2.6315789473684212, + "grad_norm": 0.1186346486210823, + "learning_rate": 4.088113990113846e-06, + "loss": 0.0012, + "step": 200 + }, + { + "epoch": 2.6447368421052633, + "grad_norm": 0.2584531605243683, + "learning_rate": 4.071013862816311e-06, + "loss": 
0.0025, + "step": 201 + }, + { + "epoch": 2.6578947368421053, + "grad_norm": 0.09868124127388, + "learning_rate": 4.0537913931321495e-06, + "loss": 0.0017, + "step": 202 + }, + { + "epoch": 2.6710526315789473, + "grad_norm": 0.09907737374305725, + "learning_rate": 4.036447922257699e-06, + "loss": 0.002, + "step": 203 + }, + { + "epoch": 2.6842105263157894, + "grad_norm": 0.06743597984313965, + "learning_rate": 4.018984800812248e-06, + "loss": 0.0006, + "step": 204 + }, + { + "epoch": 2.6973684210526314, + "grad_norm": 0.08913715183734894, + "learning_rate": 4.001403388732842e-06, + "loss": 0.0007, + "step": 205 + }, + { + "epoch": 2.7105263157894735, + "grad_norm": 0.12334564328193665, + "learning_rate": 3.983705055168391e-06, + "loss": 0.0006, + "step": 206 + }, + { + "epoch": 2.723684210526316, + "grad_norm": 0.10878646373748779, + "learning_rate": 3.965891178373038e-06, + "loss": 0.0016, + "step": 207 + }, + { + "epoch": 2.736842105263158, + "grad_norm": 0.10623253136873245, + "learning_rate": 3.947963145598833e-06, + "loss": 0.0015, + "step": 208 + }, + { + "epoch": 2.75, + "grad_norm": 0.15580499172210693, + "learning_rate": 3.929922352987702e-06, + "loss": 0.0011, + "step": 209 + }, + { + "epoch": 2.763157894736842, + "grad_norm": 0.06405838578939438, + "learning_rate": 3.911770205462717e-06, + "loss": 0.0007, + "step": 210 + }, + { + "epoch": 2.776315789473684, + "grad_norm": 0.17784689366817474, + "learning_rate": 3.8935081166186935e-06, + "loss": 0.0017, + "step": 211 + }, + { + "epoch": 2.7894736842105265, + "grad_norm": 0.14516866207122803, + "learning_rate": 3.875137508612104e-06, + "loss": 0.0014, + "step": 212 + }, + { + "epoch": 2.8026315789473686, + "grad_norm": 0.09510776400566101, + "learning_rate": 3.856659812050328e-06, + "loss": 0.0009, + "step": 213 + }, + { + "epoch": 2.8157894736842106, + "grad_norm": 0.1000828891992569, + "learning_rate": 3.838076465880248e-06, + "loss": 0.0008, + "step": 214 + }, + { + "epoch": 2.8289473684210527, + 
"grad_norm": 0.10773428529500961, + "learning_rate": 3.819388917276186e-06, + "loss": 0.0008, + "step": 215 + }, + { + "epoch": 2.8421052631578947, + "grad_norm": 0.12319213151931763, + "learning_rate": 3.8005986215272056e-06, + "loss": 0.0007, + "step": 216 + }, + { + "epoch": 2.8552631578947367, + "grad_norm": 0.07209170609712601, + "learning_rate": 3.7817070419237866e-06, + "loss": 0.0006, + "step": 217 + }, + { + "epoch": 2.8684210526315788, + "grad_norm": 0.12889248132705688, + "learning_rate": 3.7627156496438686e-06, + "loss": 0.0005, + "step": 218 + }, + { + "epoch": 2.8815789473684212, + "grad_norm": 0.05019540339708328, + "learning_rate": 3.7436259236382797e-06, + "loss": 0.0003, + "step": 219 + }, + { + "epoch": 2.8947368421052633, + "grad_norm": 0.10657542198896408, + "learning_rate": 3.7244393505155713e-06, + "loss": 0.0008, + "step": 220 + }, + { + "epoch": 2.9078947368421053, + "grad_norm": 0.15984083712100983, + "learning_rate": 3.7051574244262412e-06, + "loss": 0.0003, + "step": 221 + }, + { + "epoch": 2.9210526315789473, + "grad_norm": 0.1567343920469284, + "learning_rate": 3.6857816469463806e-06, + "loss": 0.0005, + "step": 222 + }, + { + "epoch": 2.9342105263157894, + "grad_norm": 0.07294822484254837, + "learning_rate": 3.6663135269607413e-06, + "loss": 0.0006, + "step": 223 + }, + { + "epoch": 2.9473684210526314, + "grad_norm": 0.04486699402332306, + "learning_rate": 3.6467545805452266e-06, + "loss": 0.0003, + "step": 224 + }, + { + "epoch": 2.9605263157894735, + "grad_norm": 0.08188032358884811, + "learning_rate": 3.6271063308488298e-06, + "loss": 0.0004, + "step": 225 + }, + { + "epoch": 2.973684210526316, + "grad_norm": 0.029995013028383255, + "learning_rate": 3.6073703079750204e-06, + "loss": 0.0002, + "step": 226 + }, + { + "epoch": 2.986842105263158, + "grad_norm": 0.029155094176530838, + "learning_rate": 3.5875480488625847e-06, + "loss": 0.0002, + "step": 227 + }, + { + "epoch": 3.0, + "grad_norm": 0.058334361761808395, + "learning_rate": 
3.5676410971659404e-06, + "loss": 0.0003, + "step": 228 + }, + { + "epoch": 3.013157894736842, + "grad_norm": 0.07227180153131485, + "learning_rate": 3.547651003134921e-06, + "loss": 0.0002, + "step": 229 + }, + { + "epoch": 3.026315789473684, + "grad_norm": 0.03385070338845253, + "learning_rate": 3.527579323494055e-06, + "loss": 0.0001, + "step": 230 + }, + { + "epoch": 3.039473684210526, + "grad_norm": 0.025327105075120926, + "learning_rate": 3.507427621321331e-06, + "loss": 0.0002, + "step": 231 + }, + { + "epoch": 3.0526315789473686, + "grad_norm": 0.17931883037090302, + "learning_rate": 3.4871974659264786e-06, + "loss": 0.0004, + "step": 232 + }, + { + "epoch": 3.0657894736842106, + "grad_norm": 0.05045541375875473, + "learning_rate": 3.466890432728754e-06, + "loss": 0.0001, + "step": 233 + }, + { + "epoch": 3.0789473684210527, + "grad_norm": 0.1170194149017334, + "learning_rate": 3.446508103134259e-06, + "loss": 0.0003, + "step": 234 + }, + { + "epoch": 3.0921052631578947, + "grad_norm": 0.01339724287390709, + "learning_rate": 3.426052064412785e-06, + "loss": 0.0001, + "step": 235 + }, + { + "epoch": 3.1052631578947367, + "grad_norm": 0.5850052833557129, + "learning_rate": 3.4055239095742067e-06, + "loss": 0.0004, + "step": 236 + }, + { + "epoch": 3.1184210526315788, + "grad_norm": 0.06263412535190582, + "learning_rate": 3.3849252372444295e-06, + "loss": 0.0008, + "step": 237 + }, + { + "epoch": 3.1315789473684212, + "grad_norm": 0.02470085583627224, + "learning_rate": 3.364257651540891e-06, + "loss": 0.0001, + "step": 238 + }, + { + "epoch": 3.1447368421052633, + "grad_norm": 0.1614137440919876, + "learning_rate": 3.343522761947646e-06, + "loss": 0.0004, + "step": 239 + }, + { + "epoch": 3.1578947368421053, + "grad_norm": 0.04927680641412735, + "learning_rate": 3.322722183190025e-06, + "loss": 0.0002, + "step": 240 + }, + { + "epoch": 3.1710526315789473, + "grad_norm": 0.04460301250219345, + "learning_rate": 3.3018575351088894e-06, + "loss": 0.0001, + 
"step": 241 + }, + { + "epoch": 3.1842105263157894, + "grad_norm": 0.0417938195168972, + "learning_rate": 3.280930442534486e-06, + "loss": 0.0001, + "step": 242 + }, + { + "epoch": 3.1973684210526314, + "grad_norm": 0.04140309989452362, + "learning_rate": 3.2599425351599136e-06, + "loss": 0.0001, + "step": 243 + }, + { + "epoch": 3.2105263157894735, + "grad_norm": 0.36172497272491455, + "learning_rate": 3.238895447414211e-06, + "loss": 0.001, + "step": 244 + }, + { + "epoch": 3.223684210526316, + "grad_norm": 0.19054804742336273, + "learning_rate": 3.217790818335077e-06, + "loss": 0.0005, + "step": 245 + }, + { + "epoch": 3.236842105263158, + "grad_norm": 0.031062051653862, + "learning_rate": 3.196630291441231e-06, + "loss": 0.0001, + "step": 246 + }, + { + "epoch": 3.25, + "grad_norm": 0.06263001263141632, + "learning_rate": 3.175415514604422e-06, + "loss": 0.0002, + "step": 247 + }, + { + "epoch": 3.263157894736842, + "grad_norm": 0.016942007467150688, + "learning_rate": 3.154148139921102e-06, + "loss": 0.0001, + "step": 248 + }, + { + "epoch": 3.276315789473684, + "grad_norm": 0.03281901404261589, + "learning_rate": 3.132829823583771e-06, + "loss": 0.0002, + "step": 249 + }, + { + "epoch": 3.2894736842105265, + "grad_norm": 0.06915819644927979, + "learning_rate": 3.1114622257520004e-06, + "loss": 0.0002, + "step": 250 + }, + { + "epoch": 3.3026315789473686, + "grad_norm": 0.029176127165555954, + "learning_rate": 3.0900470104231456e-06, + "loss": 0.0001, + "step": 251 + }, + { + "epoch": 3.3157894736842106, + "grad_norm": 0.03844618797302246, + "learning_rate": 3.0685858453027668e-06, + "loss": 0.0002, + "step": 252 + }, + { + "epoch": 3.3289473684210527, + "grad_norm": 0.1381211280822754, + "learning_rate": 3.047080401674754e-06, + "loss": 0.0023, + "step": 253 + }, + { + "epoch": 3.3421052631578947, + "grad_norm": 0.05453269183635712, + "learning_rate": 3.0255323542711784e-06, + "loss": 0.0007, + "step": 254 + }, + { + "epoch": 3.3552631578947367, + 
"grad_norm": 0.17172302305698395, + "learning_rate": 3.00394338114187e-06, + "loss": 0.0002, + "step": 255 + }, + { + "epoch": 3.3684210526315788, + "grad_norm": 0.09509152173995972, + "learning_rate": 2.9823151635237424e-06, + "loss": 0.0003, + "step": 256 + }, + { + "epoch": 3.3815789473684212, + "grad_norm": 0.049044203013181686, + "learning_rate": 2.9606493857098657e-06, + "loss": 0.0003, + "step": 257 + }, + { + "epoch": 3.3947368421052633, + "grad_norm": 0.01755386032164097, + "learning_rate": 2.938947734918302e-06, + "loss": 0.0001, + "step": 258 + }, + { + "epoch": 3.4078947368421053, + "grad_norm": 0.06148788705468178, + "learning_rate": 2.9172119011607153e-06, + "loss": 0.0002, + "step": 259 + }, + { + "epoch": 3.4210526315789473, + "grad_norm": 0.047921039164066315, + "learning_rate": 2.8954435771107604e-06, + "loss": 0.0001, + "step": 260 + }, + { + "epoch": 3.4342105263157894, + "grad_norm": 0.04058835282921791, + "learning_rate": 2.8736444579722665e-06, + "loss": 0.0002, + "step": 261 + }, + { + "epoch": 3.4473684210526314, + "grad_norm": 0.04581223055720329, + "learning_rate": 2.8518162413472266e-06, + "loss": 0.0003, + "step": 262 + }, + { + "epoch": 3.4605263157894735, + "grad_norm": 0.060498207807540894, + "learning_rate": 2.8299606271035913e-06, + "loss": 0.0002, + "step": 263 + }, + { + "epoch": 3.473684210526316, + "grad_norm": 0.025159459561109543, + "learning_rate": 2.8080793172428965e-06, + "loss": 0.0001, + "step": 264 + }, + { + "epoch": 3.486842105263158, + "grad_norm": 0.009322446770966053, + "learning_rate": 2.786174015767721e-06, + "loss": 0.0, + "step": 265 + }, + { + "epoch": 3.5, + "grad_norm": 0.048437751829624176, + "learning_rate": 2.764246428548983e-06, + "loss": 0.0002, + "step": 266 + }, + { + "epoch": 3.513157894736842, + "grad_norm": 0.025815390050411224, + "learning_rate": 2.742298263193099e-06, + "loss": 0.0001, + "step": 267 + }, + { + "epoch": 3.526315789473684, + "grad_norm": 0.038374874740839005, + "learning_rate": 
2.720331228909005e-06, + "loss": 0.0002, + "step": 268 + }, + { + "epoch": 3.5394736842105265, + "grad_norm": 0.010204033926129341, + "learning_rate": 2.6983470363750497e-06, + "loss": 0.0001, + "step": 269 + }, + { + "epoch": 3.5526315789473686, + "grad_norm": 0.004963386803865433, + "learning_rate": 2.6763473976057776e-06, + "loss": 0.0, + "step": 270 + }, + { + "epoch": 3.5657894736842106, + "grad_norm": 0.0680844709277153, + "learning_rate": 2.6543340258186063e-06, + "loss": 0.0002, + "step": 271 + }, + { + "epoch": 3.5789473684210527, + "grad_norm": 0.01941310614347458, + "learning_rate": 2.6323086353004077e-06, + "loss": 0.0001, + "step": 272 + }, + { + "epoch": 3.5921052631578947, + "grad_norm": 0.07440650463104248, + "learning_rate": 2.610272941274012e-06, + "loss": 0.0001, + "step": 273 + }, + { + "epoch": 3.6052631578947367, + "grad_norm": 0.044731903821229935, + "learning_rate": 2.588228659764632e-06, + "loss": 0.0002, + "step": 274 + }, + { + "epoch": 3.6184210526315788, + "grad_norm": 0.021858500316739082, + "learning_rate": 2.5661775074662276e-06, + "loss": 0.0001, + "step": 275 + }, + { + "epoch": 3.6315789473684212, + "grad_norm": 0.03670589625835419, + "learning_rate": 2.544121201607822e-06, + "loss": 0.0002, + "step": 276 + }, + { + "epoch": 3.6447368421052633, + "grad_norm": 0.015562576241791248, + "learning_rate": 2.5220614598197708e-06, + "loss": 0.0001, + "step": 277 + }, + { + "epoch": 3.6578947368421053, + "grad_norm": 0.011691650375723839, + "learning_rate": 2.5e-06, + "loss": 0.0, + "step": 278 + }, + { + "epoch": 3.6710526315789473, + "grad_norm": 0.04999399557709694, + "learning_rate": 2.477938540180231e-06, + "loss": 0.0001, + "step": 279 + }, + { + "epoch": 3.6842105263157894, + "grad_norm": 0.07675550132989883, + "learning_rate": 2.455878798392179e-06, + "loss": 0.0001, + "step": 280 + }, + { + "epoch": 3.6973684210526314, + "grad_norm": 0.03533118963241577, + "learning_rate": 2.433822492533774e-06, + "loss": 0.0001, + "step": 281 + 
}, + { + "epoch": 3.7105263157894735, + "grad_norm": 0.03588724508881569, + "learning_rate": 2.411771340235369e-06, + "loss": 0.0003, + "step": 282 + }, + { + "epoch": 3.723684210526316, + "grad_norm": 0.020976359024643898, + "learning_rate": 2.389727058725989e-06, + "loss": 0.0001, + "step": 283 + }, + { + "epoch": 3.736842105263158, + "grad_norm": 0.008930912241339684, + "learning_rate": 2.3676913646995923e-06, + "loss": 0.0, + "step": 284 + }, + { + "epoch": 3.75, + "grad_norm": 0.02440304309129715, + "learning_rate": 2.3456659741813945e-06, + "loss": 0.0001, + "step": 285 + }, + { + "epoch": 3.763157894736842, + "grad_norm": 0.006568162236362696, + "learning_rate": 2.3236526023942224e-06, + "loss": 0.0, + "step": 286 + }, + { + "epoch": 3.776315789473684, + "grad_norm": 0.004894652403891087, + "learning_rate": 2.301652963624951e-06, + "loss": 0.0, + "step": 287 + }, + { + "epoch": 3.7894736842105265, + "grad_norm": 0.05479031801223755, + "learning_rate": 2.2796687710909966e-06, + "loss": 0.0003, + "step": 288 + }, + { + "epoch": 3.8026315789473686, + "grad_norm": 0.009381825104355812, + "learning_rate": 2.2577017368069017e-06, + "loss": 0.0, + "step": 289 + }, + { + "epoch": 3.8157894736842106, + "grad_norm": 0.011159488931298256, + "learning_rate": 2.235753571451018e-06, + "loss": 0.0, + "step": 290 + }, + { + "epoch": 3.8289473684210527, + "grad_norm": 0.0077633000910282135, + "learning_rate": 2.2138259842322794e-06, + "loss": 0.0, + "step": 291 + }, + { + "epoch": 3.8421052631578947, + "grad_norm": 0.040188852697610855, + "learning_rate": 2.191920682757104e-06, + "loss": 0.0002, + "step": 292 + }, + { + "epoch": 3.8552631578947367, + "grad_norm": 0.03593844920396805, + "learning_rate": 2.170039372896409e-06, + "loss": 0.0001, + "step": 293 + }, + { + "epoch": 3.8684210526315788, + "grad_norm": 0.011377676390111446, + "learning_rate": 2.148183758652774e-06, + "loss": 0.0, + "step": 294 + }, + { + "epoch": 3.8815789473684212, + "grad_norm": 
0.018670417368412018, + "learning_rate": 2.126355542027734e-06, + "loss": 0.0, + "step": 295 + }, + { + "epoch": 3.8947368421052633, + "grad_norm": 0.026110412552952766, + "learning_rate": 2.1045564228892404e-06, + "loss": 0.0001, + "step": 296 + }, + { + "epoch": 3.9078947368421053, + "grad_norm": 0.04683105647563934, + "learning_rate": 2.0827880988392856e-06, + "loss": 0.0001, + "step": 297 + }, + { + "epoch": 3.9210526315789473, + "grad_norm": 0.04568646103143692, + "learning_rate": 2.0610522650816985e-06, + "loss": 0.0001, + "step": 298 + }, + { + "epoch": 3.9342105263157894, + "grad_norm": 0.0064105079509317875, + "learning_rate": 2.0393506142901347e-06, + "loss": 0.0, + "step": 299 + }, + { + "epoch": 3.9473684210526314, + "grad_norm": 0.004660090897232294, + "learning_rate": 2.017684836476258e-06, + "loss": 0.0, + "step": 300 + }, + { + "epoch": 3.9605263157894735, + "grad_norm": 0.023000195622444153, + "learning_rate": 1.9960566188581306e-06, + "loss": 0.0001, + "step": 301 + }, + { + "epoch": 3.973684210526316, + "grad_norm": 0.010502593591809273, + "learning_rate": 1.9744676457288225e-06, + "loss": 0.0, + "step": 302 + }, + { + "epoch": 3.986842105263158, + "grad_norm": 0.017971891909837723, + "learning_rate": 1.952919598325247e-06, + "loss": 0.0001, + "step": 303 + }, + { + "epoch": 4.0, + "grad_norm": 0.019201353192329407, + "learning_rate": 1.9314141546972345e-06, + "loss": 0.0001, + "step": 304 + } + ], + "logging_steps": 1, + "max_steps": 456, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 76, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.955687712124882e+19, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-304/training_args.bin b/checkpoint-304/training_args.bin new file mode 
100644 index 0000000000000000000000000000000000000000..5a1edbdcc63a93daa09112168cf20c0f8fcb7512 --- /dev/null +++ b/checkpoint-304/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:041cfaa5bf8383821dea4fa5a9d2eab2caad4644c4cd651398c8b0ab1541b270 +size 7992 diff --git a/checkpoint-304/zero_to_fp32.py b/checkpoint-304/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/checkpoint-304/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
def parse_optim_states(files, ds_checkpoint_dir):
    """
    Load every per-rank optimizer-state file and pull out the fp32 master-weight partitions.

    Args:
        - ``files``: paths of the ``*_optim_states.pt`` files to load
        - ``ds_checkpoint_dir``: folder the files came from (only used in error messages)

    Returns:
        - ``(zero_stage, world_size, fp32_flat_groups)``; ``fp32_flat_groups`` has one entry per
          file - the stored list of group tensors for stage <= 2, or a single tensor made by
          concatenating the groups for stage 3

    Raises:
        - ``ValueError``: no files were given, the first file is not a zero checkpoint, the number
          of files does not match the recorded partition count, or the zero stage is unknown
    """
    total_files = len(files)
    if total_files == 0:
        # fail with a clear message instead of an IndexError on state_dicts[0] below
        raise ValueError(f"no '*_optim_states.pt' files were given for '{ds_checkpoint_dir}'")

    state_dicts = []
    for f in files:
        # NOTE(review): torch.load unpickles the file - only run this on checkpoints you trust
        state_dict = torch.load(f, map_location=device)
        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
        # and also handle the case where it was already removed by another helper script
        # NOTE(review): the outer key is a literal here while the rest of the function uses
        # OPTIMIZER_STATE_DICT - presumably the same value; confirm against the constants module
        state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
        state_dicts.append(state_dict)

    rank0_optim_state = state_dicts[0][OPTIMIZER_STATE_DICT]
    if ZERO_STAGE not in rank0_optim_state:
        raise ValueError(f"{files[0]} is not a zero checkpoint")
    zero_stage = rank0_optim_state[ZERO_STAGE]
    world_size = rank0_optim_state[PARTITION_COUNT]

    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
    # parameters can be different from data parallelism for non-expert parameters. So we can just
    # use the max of the partition_count to get the dp world_size.
    if isinstance(world_size, list):
        world_size = max(world_size)

    if world_size != total_files:
        raise ValueError(
            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
        )

    # the groups are named differently in each stage
    if zero_stage <= 2:
        fp32_flat_groups = [sd[OPTIMIZER_STATE_DICT][SINGLE_PARTITION_OF_FP32_GROUPS] for sd in state_dicts]
    elif zero_stage == 3:
        # if there is more than one param group, there will be multiple flattened tensors - one
        # flattened tensor per group - for simplicity merge them into a single tensor
        #
        # XXX: could make the script more memory efficient for when there are multiple groups - it
        # will require matching the sub-lists of param_shapes for each param group flattened tensor
        fp32_flat_groups = [torch.cat(sd[OPTIMIZER_STATE_DICT][FP32_FLAT_GROUPS], 0) for sd in state_dicts]
    else:
        raise ValueError(f"unknown zero stage {zero_stage}")

    return zero_stage, world_size, fp32_flat_groups
def _zero2_merge_frozen_params(state_dict, zero_model_states):
    """
    Copy the frozen parameters recorded in the first model state into ``state_dict``.

    Each fragment is stored under its parameter name exactly as found - nothing is concatenated
    or reshaped here. Returns without touching ``state_dict`` when no frozen shapes were recorded.
    Progress counters are printed along the way.
    """
    first_state = zero_model_states[0]
    if first_state.frozen_param_shapes is None or len(first_state.frozen_param_shapes) == 0:
        return

    shapes_by_name = first_state.frozen_param_shapes
    fragments_by_name = first_state.frozen_param_fragments

    if debug:
        num_elem = sum(s.numel() for s in shapes_by_name.values())
        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')

    # report how much data is expected versus how much is actually present
    wanted_params = len(shapes_by_name)
    wanted_numel = 0
    for recorded_shape in shapes_by_name.values():
        wanted_numel += recorded_shape.numel()
    avail_numel = 0
    for fragment in fragments_by_name.values():
        avail_numel += fragment.numel()
    print(f'Frozen params: Have {avail_numel} numels to process.')
    print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')

    total_params = 0
    total_numel = 0
    for total_params, (name, shape) in enumerate(shapes_by_name.items(), start=1):
        numel = shape.numel()
        total_numel += numel

        state_dict[name] = fragments_by_name[name]

        if debug:
            print(f"{name} full shape: {shape} unpartitioned numel {numel} ")

    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
def zero3_partitioned_param_info(unpartitioned_numel, world_size):
    """
    Work out how one parameter is split across ``world_size`` partitions.

    Args:
        - ``unpartitioned_numel``: element count of the full parameter
        - ``world_size``: number of partitions

    Returns:
        - ``(partitioned_numel, padding_numel)``: elements held per partition (rounded up), and the
          number of elements needed to round ``unpartitioned_numel`` up to a multiple of ``world_size``
    """
    leftover = unpartitioned_numel % world_size
    if leftover:
        pad = world_size - leftover
    else:
        pad = 0
    per_partition = math.ceil(unpartitioned_numel / world_size)
    return per_partition, pad
def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
                                               exclude_frozen_parameters):
    """
    Assemble the consolidated state dict for a ZeRO stage 3 checkpoint.

    Entries are added in this order: buffers of the first model state, frozen parameters (skipped
    when ``exclude_frozen_parameters`` is true), trainable parameters, and finally shared
    parameters, which are pointed at the entry they share with.

    Returns:
        - ``OrderedDict`` mapping names to tensors
    """
    first_state = zero_model_states[0]
    merged = OrderedDict()

    # buffers
    saved_buffers = first_state.buffers
    merged.update(saved_buffers)
    if debug:
        print(f"added {len(saved_buffers)} buffers")

    if not exclude_frozen_parameters:
        _zero3_merge_frozen_params(merged, world_size, zero_model_states)

    _zero3_merge_trainable_params(merged, world_size, fp32_flat_groups, zero_model_states)

    # recover shared parameters: only link a name when the entry it shares with was reconstructed
    for entry in first_state.shared_params:
        alias, source = entry[0], entry[1]
        if source in merged:
            merged[alias] = merged[source]

    return merged
def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
    """
    Rebuild the fp32 weights from a ZeRO 2 or 3 checkpoint and load them into ``model``.

    Steps performed:

    1. Convert the checkpoint into a single fp32 consolidated ``state_dict``
    2. Move the provided model to cpu
    3. Load the ``state_dict`` into it with ``strict=False``, so a mismatch between the model's
       keys and the checkpoint's keys is not treated as an error here

    Args:
        - ``model``: the model object to update
        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``

    Returns:
        - ``model``: modified model

    Make sure you have plenty of CPU memory available before you call this function. If you don't
    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
    conveniently placed for you in the checkpoint folder.

    A typical usage might be ::

        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
        # submit to model hub or save the model to share with others

    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.

    """
    logger.info("Extracting fp32 weights")
    fp32_weights = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)

    logger.info("Overwriting model with fp32 weights")
    cpu_model = model.cpu()
    cpu_model.load_state_dict(fp32_weights, strict=False)

    return cpu_model
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/checkpoint-380/README.md b/checkpoint-380/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fdf619c317c2fe82074662582dbd68166b6f9d50 --- /dev/null +++ b/checkpoint-380/README.md @@ -0,0 +1,202 @@ +--- +base_model: nvidia/Llama-3.1-Nemotron-70B-Instruct-HF +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.0 \ No newline at end of file diff --git a/checkpoint-380/adapter_config.json b/checkpoint-380/adapter_config.json new file mode 100644 index 
0000000000000000000000000000000000000000..a192388a7b55129be9ad9168abc396b47bbda6f7 --- /dev/null +++ b/checkpoint-380/adapter_config.json @@ -0,0 +1,42 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 512, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [ + "embed_tokens", + "lm_head" + ], + "peft_type": "LORA", + "r": 256, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "q_proj", + "o_proj", + "down_proj", + "gate_proj", + "k_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-380/adapter_model.safetensors b/checkpoint-380/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b75fc93660393b3cc13e189d24ff6c13d45c04de --- /dev/null +++ b/checkpoint-380/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af5f10248781de8c8986a75ccc3d4b70674eabfea266a9228100f1ac8a84131b +size 10829849744 diff --git a/checkpoint-380/global_step380/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-380/global_step380/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..16a67dbb2377b01f71538068072735013654e14f --- /dev/null +++ b/checkpoint-380/global_step380/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:089f52c5c502cd55ac11d7af271077fa5c67ccc0c000db14ec85619957b2a216 +size 21659418140 diff --git a/checkpoint-380/global_step380/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-380/global_step380/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c1e5e6a529d6235e21fefb6c8da27b58bf2f1e6d --- /dev/null +++ b/checkpoint-380/global_step380/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:011eb674537c3bcd7b79af708c05b21e1f43b9d5de2c1418e7a0d8a0ff54ad7f +size 21659457372 diff --git a/checkpoint-380/global_step380/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-380/global_step380/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a3cbdcb99f00c3d2b0f6da1fdbd78e6ac17254be --- /dev/null +++ b/checkpoint-380/global_step380/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63b89b837434645d644d03acafbead76b20c45577f744d8d30e2d4c762a5e113 +size 21659417820 diff --git a/checkpoint-380/global_step380/mp_rank_00_model_states.pt b/checkpoint-380/global_step380/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d706f3dd03525b63ce80ecccd819cccd9a43091f --- /dev/null +++ b/checkpoint-380/global_step380/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45c1d795b74b5f3050ab1cf311ee5678b3cb4d7155dc814465d5c219e5c19eb0 +size 11918643933 diff --git a/checkpoint-380/latest b/checkpoint-380/latest new file mode 100644 index 0000000000000000000000000000000000000000..97a28d1c33298568d84d9916417869e8f7800fb7 --- /dev/null +++ b/checkpoint-380/latest @@ -0,0 +1 @@ +global_step380 \ No newline at end of file diff --git a/checkpoint-380/rng_state_0.pth b/checkpoint-380/rng_state_0.pth new file mode 100644 index 
0000000000000000000000000000000000000000..360b0f6a03a87f6aed26f672cfc6136b7bbf1611 --- /dev/null +++ b/checkpoint-380/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8fe26d5c64cbc7d621141b185caf2009e21d51970e79374540d8781688adeaf8 +size 14768 diff --git a/checkpoint-380/rng_state_1.pth b/checkpoint-380/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..acb1b5e3308c9e88d4c63b5928d441d53b890547 --- /dev/null +++ b/checkpoint-380/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78c608d9a2f68dfe0957985d05001d8334947b7cfbf16e6d2348f077e306d8cc +size 14768 diff --git a/checkpoint-380/rng_state_2.pth b/checkpoint-380/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..0f9ceea00dd006196381c120c6c3d96bf762fe79 --- /dev/null +++ b/checkpoint-380/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88d769eb76f77879c1fcd6116be4251a854ae051175d403fb920f7282b89fff9 +size 14768 diff --git a/checkpoint-380/scheduler.pt b/checkpoint-380/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..d79447f5e29bdc91f65c41e7c702584c9ad3f146 --- /dev/null +++ b/checkpoint-380/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a93097201a634931518316956d488c8df82e81f7fc29c1bde6ce7bd6033e7827 +size 1064 diff --git a/checkpoint-380/special_tokens_map.json b/checkpoint-380/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..278b7f0f84be865c4687700ee7b3c63d89a51e18 --- /dev/null +++ b/checkpoint-380/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + 
"pad_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-380/tokenizer.json b/checkpoint-380/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-380/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-380/tokenizer_config.json b/checkpoint-380/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..fdde94c29816839ec3c29d6c6461206a49911f3c --- /dev/null +++ b/checkpoint-380/tokenizer_config.json @@ -0,0 +1,2064 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": 
"<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": 
"<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": 
"<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": 
"<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": 
"<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + 
"extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|end_of_text|>", + "tokenizer_class": "PreTrainedTokenizer" +} diff --git a/checkpoint-380/trainer_state.json b/checkpoint-380/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f97ea430dbd30f8526b5a0b3be3ffb8edb7ea56c --- /dev/null +++ b/checkpoint-380/trainer_state.json @@ -0,0 +1,2693 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 5.0, + "eval_steps": 500, + "global_step": 380, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.013157894736842105, + "grad_norm": 34.99433898925781, + "learning_rate": 5.0000000000000004e-08, + "loss": 2.595, + "step": 1 + }, + { + "epoch": 0.02631578947368421, + "grad_norm": 35.6848258972168, + "learning_rate": 1.0000000000000001e-07, + "loss": 2.6447, + "step": 2 + }, + { + "epoch": 0.039473684210526314, + "grad_norm": 35.07997512817383, + "learning_rate": 1.5000000000000002e-07, + "loss": 2.5819, + "step": 3 + }, + { + "epoch": 0.05263157894736842, + "grad_norm": 34.3863525390625, + "learning_rate": 2.0000000000000002e-07, + "loss": 2.5739, + "step": 4 + }, + { + "epoch": 0.06578947368421052, + "grad_norm": 35.443077087402344, + "learning_rate": 2.5000000000000004e-07, + "loss": 2.6071, + "step": 5 + }, + { + "epoch": 0.07894736842105263, + "grad_norm": 34.70173263549805, + "learning_rate": 3.0000000000000004e-07, + "loss": 2.5487, + "step": 6 + }, + { + "epoch": 0.09210526315789473, + "grad_norm": 34.421295166015625, + "learning_rate": 3.5000000000000004e-07, + "loss": 2.5494, + "step": 7 + }, + { + "epoch": 0.10526315789473684, + "grad_norm": 35.152748107910156, + "learning_rate": 4.0000000000000003e-07, + "loss": 2.5936, + "step": 8 + }, + { + "epoch": 0.11842105263157894, + "grad_norm": 34.947021484375, + "learning_rate": 
4.5000000000000003e-07, + "loss": 2.5574, + "step": 9 + }, + { + "epoch": 0.13157894736842105, + "grad_norm": 34.67315673828125, + "learning_rate": 5.000000000000001e-07, + "loss": 2.4894, + "step": 10 + }, + { + "epoch": 0.14473684210526316, + "grad_norm": 34.679954528808594, + "learning_rate": 5.5e-07, + "loss": 2.4985, + "step": 11 + }, + { + "epoch": 0.15789473684210525, + "grad_norm": 33.57002258300781, + "learning_rate": 6.000000000000001e-07, + "loss": 2.4339, + "step": 12 + }, + { + "epoch": 0.17105263157894737, + "grad_norm": 33.517276763916016, + "learning_rate": 6.5e-07, + "loss": 2.4055, + "step": 13 + }, + { + "epoch": 0.18421052631578946, + "grad_norm": 33.5312385559082, + "learning_rate": 7.000000000000001e-07, + "loss": 2.3806, + "step": 14 + }, + { + "epoch": 0.19736842105263158, + "grad_norm": 32.01276779174805, + "learning_rate": 7.5e-07, + "loss": 2.2505, + "step": 15 + }, + { + "epoch": 0.21052631578947367, + "grad_norm": 31.827980041503906, + "learning_rate": 8.000000000000001e-07, + "loss": 2.1359, + "step": 16 + }, + { + "epoch": 0.2236842105263158, + "grad_norm": 31.437101364135742, + "learning_rate": 8.500000000000001e-07, + "loss": 2.1117, + "step": 17 + }, + { + "epoch": 0.23684210526315788, + "grad_norm": 30.315187454223633, + "learning_rate": 9.000000000000001e-07, + "loss": 1.9795, + "step": 18 + }, + { + "epoch": 0.25, + "grad_norm": 29.622655868530273, + "learning_rate": 9.500000000000001e-07, + "loss": 1.8472, + "step": 19 + }, + { + "epoch": 0.2631578947368421, + "grad_norm": 28.628408432006836, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.7283, + "step": 20 + }, + { + "epoch": 0.27631578947368424, + "grad_norm": 27.83180046081543, + "learning_rate": 1.0500000000000001e-06, + "loss": 1.5942, + "step": 21 + }, + { + "epoch": 0.2894736842105263, + "grad_norm": 26.911596298217773, + "learning_rate": 1.1e-06, + "loss": 1.4467, + "step": 22 + }, + { + "epoch": 0.3026315789473684, + "grad_norm": 25.88102149963379, + 
"learning_rate": 1.1500000000000002e-06, + "loss": 1.3007, + "step": 23 + }, + { + "epoch": 0.3157894736842105, + "grad_norm": 25.146381378173828, + "learning_rate": 1.2000000000000002e-06, + "loss": 1.1319, + "step": 24 + }, + { + "epoch": 0.32894736842105265, + "grad_norm": 24.800382614135742, + "learning_rate": 1.25e-06, + "loss": 0.9359, + "step": 25 + }, + { + "epoch": 0.34210526315789475, + "grad_norm": 24.648332595825195, + "learning_rate": 1.3e-06, + "loss": 0.7054, + "step": 26 + }, + { + "epoch": 0.35526315789473684, + "grad_norm": 22.947620391845703, + "learning_rate": 1.3500000000000002e-06, + "loss": 0.5209, + "step": 27 + }, + { + "epoch": 0.3684210526315789, + "grad_norm": 17.80010414123535, + "learning_rate": 1.4000000000000001e-06, + "loss": 0.3546, + "step": 28 + }, + { + "epoch": 0.3815789473684211, + "grad_norm": 11.841789245605469, + "learning_rate": 1.45e-06, + "loss": 0.26, + "step": 29 + }, + { + "epoch": 0.39473684210526316, + "grad_norm": 7.275839805603027, + "learning_rate": 1.5e-06, + "loss": 0.1808, + "step": 30 + }, + { + "epoch": 0.40789473684210525, + "grad_norm": 4.6324543952941895, + "learning_rate": 1.5500000000000002e-06, + "loss": 0.1464, + "step": 31 + }, + { + "epoch": 0.42105263157894735, + "grad_norm": 3.1281485557556152, + "learning_rate": 1.6000000000000001e-06, + "loss": 0.1079, + "step": 32 + }, + { + "epoch": 0.4342105263157895, + "grad_norm": 2.062562942504883, + "learning_rate": 1.6500000000000003e-06, + "loss": 0.0966, + "step": 33 + }, + { + "epoch": 0.4473684210526316, + "grad_norm": 2.1343328952789307, + "learning_rate": 1.7000000000000002e-06, + "loss": 0.088, + "step": 34 + }, + { + "epoch": 0.4605263157894737, + "grad_norm": 1.6768524646759033, + "learning_rate": 1.75e-06, + "loss": 0.0783, + "step": 35 + }, + { + "epoch": 0.47368421052631576, + "grad_norm": 1.0879229307174683, + "learning_rate": 1.8000000000000001e-06, + "loss": 0.0623, + "step": 36 + }, + { + "epoch": 0.4868421052631579, + "grad_norm": 
0.83177649974823, + "learning_rate": 1.85e-06, + "loss": 0.0655, + "step": 37 + }, + { + "epoch": 0.5, + "grad_norm": 0.5678385496139526, + "learning_rate": 1.9000000000000002e-06, + "loss": 0.0565, + "step": 38 + }, + { + "epoch": 0.5131578947368421, + "grad_norm": 0.6994458436965942, + "learning_rate": 1.9500000000000004e-06, + "loss": 0.0491, + "step": 39 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 0.711387038230896, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.0507, + "step": 40 + }, + { + "epoch": 0.5394736842105263, + "grad_norm": 0.7169735431671143, + "learning_rate": 2.05e-06, + "loss": 0.0478, + "step": 41 + }, + { + "epoch": 0.5526315789473685, + "grad_norm": 0.603631317615509, + "learning_rate": 2.1000000000000002e-06, + "loss": 0.0507, + "step": 42 + }, + { + "epoch": 0.5657894736842105, + "grad_norm": 0.617487907409668, + "learning_rate": 2.15e-06, + "loss": 0.043, + "step": 43 + }, + { + "epoch": 0.5789473684210527, + "grad_norm": 0.4638065993785858, + "learning_rate": 2.2e-06, + "loss": 0.0472, + "step": 44 + }, + { + "epoch": 0.5921052631578947, + "grad_norm": 0.5996385216712952, + "learning_rate": 2.25e-06, + "loss": 0.0429, + "step": 45 + }, + { + "epoch": 0.6052631578947368, + "grad_norm": 0.39118286967277527, + "learning_rate": 2.3000000000000004e-06, + "loss": 0.0421, + "step": 46 + }, + { + "epoch": 0.618421052631579, + "grad_norm": 0.3118075728416443, + "learning_rate": 2.35e-06, + "loss": 0.0383, + "step": 47 + }, + { + "epoch": 0.631578947368421, + "grad_norm": 0.31731992959976196, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.041, + "step": 48 + }, + { + "epoch": 0.6447368421052632, + "grad_norm": 0.5413194298744202, + "learning_rate": 2.4500000000000003e-06, + "loss": 0.0397, + "step": 49 + }, + { + "epoch": 0.6578947368421053, + "grad_norm": 0.32958006858825684, + "learning_rate": 2.5e-06, + "loss": 0.0355, + "step": 50 + }, + { + "epoch": 0.6710526315789473, + "grad_norm": 0.596309244632721, + 
"learning_rate": 2.55e-06, + "loss": 0.0413, + "step": 51 + }, + { + "epoch": 0.6842105263157895, + "grad_norm": 0.4557362496852875, + "learning_rate": 2.6e-06, + "loss": 0.0461, + "step": 52 + }, + { + "epoch": 0.6973684210526315, + "grad_norm": 0.3345410227775574, + "learning_rate": 2.6500000000000005e-06, + "loss": 0.0385, + "step": 53 + }, + { + "epoch": 0.7105263157894737, + "grad_norm": 0.3047848343849182, + "learning_rate": 2.7000000000000004e-06, + "loss": 0.0383, + "step": 54 + }, + { + "epoch": 0.7236842105263158, + "grad_norm": 0.43763449788093567, + "learning_rate": 2.7500000000000004e-06, + "loss": 0.038, + "step": 55 + }, + { + "epoch": 0.7368421052631579, + "grad_norm": 0.26870036125183105, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.0374, + "step": 56 + }, + { + "epoch": 0.75, + "grad_norm": 0.38762542605400085, + "learning_rate": 2.85e-06, + "loss": 0.0349, + "step": 57 + }, + { + "epoch": 0.7631578947368421, + "grad_norm": 0.27517396211624146, + "learning_rate": 2.9e-06, + "loss": 0.0398, + "step": 58 + }, + { + "epoch": 0.7763157894736842, + "grad_norm": 0.30815261602401733, + "learning_rate": 2.95e-06, + "loss": 0.0364, + "step": 59 + }, + { + "epoch": 0.7894736842105263, + "grad_norm": 0.30011361837387085, + "learning_rate": 3e-06, + "loss": 0.0307, + "step": 60 + }, + { + "epoch": 0.8026315789473685, + "grad_norm": 0.3269154727458954, + "learning_rate": 3.05e-06, + "loss": 0.0344, + "step": 61 + }, + { + "epoch": 0.8157894736842105, + "grad_norm": 0.3750869333744049, + "learning_rate": 3.1000000000000004e-06, + "loss": 0.0339, + "step": 62 + }, + { + "epoch": 0.8289473684210527, + "grad_norm": 0.29285815358161926, + "learning_rate": 3.1500000000000003e-06, + "loss": 0.034, + "step": 63 + }, + { + "epoch": 0.8421052631578947, + "grad_norm": 0.4157550632953644, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.0348, + "step": 64 + }, + { + "epoch": 0.8552631578947368, + "grad_norm": 0.2852867543697357, + "learning_rate": 
3.2500000000000002e-06, + "loss": 0.0319, + "step": 65 + }, + { + "epoch": 0.868421052631579, + "grad_norm": 0.4384031593799591, + "learning_rate": 3.3000000000000006e-06, + "loss": 0.0319, + "step": 66 + }, + { + "epoch": 0.881578947368421, + "grad_norm": 0.4003254771232605, + "learning_rate": 3.3500000000000005e-06, + "loss": 0.0347, + "step": 67 + }, + { + "epoch": 0.8947368421052632, + "grad_norm": 0.49913832545280457, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.0347, + "step": 68 + }, + { + "epoch": 0.9078947368421053, + "grad_norm": 0.22642269730567932, + "learning_rate": 3.45e-06, + "loss": 0.0306, + "step": 69 + }, + { + "epoch": 0.9210526315789473, + "grad_norm": 0.34004101157188416, + "learning_rate": 3.5e-06, + "loss": 0.0337, + "step": 70 + }, + { + "epoch": 0.9342105263157895, + "grad_norm": 0.21503636240959167, + "learning_rate": 3.5500000000000003e-06, + "loss": 0.0311, + "step": 71 + }, + { + "epoch": 0.9473684210526315, + "grad_norm": 0.33802086114883423, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.0293, + "step": 72 + }, + { + "epoch": 0.9605263157894737, + "grad_norm": 0.2488064169883728, + "learning_rate": 3.65e-06, + "loss": 0.0318, + "step": 73 + }, + { + "epoch": 0.9736842105263158, + "grad_norm": 0.21124528348445892, + "learning_rate": 3.7e-06, + "loss": 0.0293, + "step": 74 + }, + { + "epoch": 0.9868421052631579, + "grad_norm": 0.3108712136745453, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.0288, + "step": 75 + }, + { + "epoch": 1.0, + "grad_norm": 0.33483418822288513, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.031, + "step": 76 + }, + { + "epoch": 1.013157894736842, + "grad_norm": 0.3099130690097809, + "learning_rate": 3.85e-06, + "loss": 0.0286, + "step": 77 + }, + { + "epoch": 1.0263157894736843, + "grad_norm": 0.22946476936340332, + "learning_rate": 3.900000000000001e-06, + "loss": 0.0307, + "step": 78 + }, + { + "epoch": 1.0394736842105263, + "grad_norm": 0.36924120783805847, + 
"learning_rate": 3.95e-06, + "loss": 0.0274, + "step": 79 + }, + { + "epoch": 1.0526315789473684, + "grad_norm": 0.30895617604255676, + "learning_rate": 4.000000000000001e-06, + "loss": 0.0274, + "step": 80 + }, + { + "epoch": 1.0657894736842106, + "grad_norm": 0.42033568024635315, + "learning_rate": 4.05e-06, + "loss": 0.0298, + "step": 81 + }, + { + "epoch": 1.0789473684210527, + "grad_norm": 0.35573887825012207, + "learning_rate": 4.1e-06, + "loss": 0.0286, + "step": 82 + }, + { + "epoch": 1.0921052631578947, + "grad_norm": 0.24631913006305695, + "learning_rate": 4.15e-06, + "loss": 0.0294, + "step": 83 + }, + { + "epoch": 1.1052631578947367, + "grad_norm": 0.2908592224121094, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.0243, + "step": 84 + }, + { + "epoch": 1.118421052631579, + "grad_norm": 0.3293064832687378, + "learning_rate": 4.25e-06, + "loss": 0.0253, + "step": 85 + }, + { + "epoch": 1.131578947368421, + "grad_norm": 0.3789626359939575, + "learning_rate": 4.3e-06, + "loss": 0.0253, + "step": 86 + }, + { + "epoch": 1.1447368421052633, + "grad_norm": 0.3900983929634094, + "learning_rate": 4.350000000000001e-06, + "loss": 0.0248, + "step": 87 + }, + { + "epoch": 1.1578947368421053, + "grad_norm": 0.28972727060317993, + "learning_rate": 4.4e-06, + "loss": 0.0256, + "step": 88 + }, + { + "epoch": 1.1710526315789473, + "grad_norm": 0.4615432620048523, + "learning_rate": 4.450000000000001e-06, + "loss": 0.0259, + "step": 89 + }, + { + "epoch": 1.1842105263157894, + "grad_norm": 0.3959222137928009, + "learning_rate": 4.5e-06, + "loss": 0.0277, + "step": 90 + }, + { + "epoch": 1.1973684210526316, + "grad_norm": 0.4927828907966614, + "learning_rate": 4.5500000000000005e-06, + "loss": 0.0251, + "step": 91 + }, + { + "epoch": 1.2105263157894737, + "grad_norm": 0.23854510486125946, + "learning_rate": 4.600000000000001e-06, + "loss": 0.0214, + "step": 92 + }, + { + "epoch": 1.2236842105263157, + "grad_norm": 0.2470882534980774, + "learning_rate": 4.65e-06, + 
"loss": 0.0255, + "step": 93 + }, + { + "epoch": 1.236842105263158, + "grad_norm": 0.22575952112674713, + "learning_rate": 4.7e-06, + "loss": 0.0208, + "step": 94 + }, + { + "epoch": 1.25, + "grad_norm": 0.437495619058609, + "learning_rate": 4.75e-06, + "loss": 0.0234, + "step": 95 + }, + { + "epoch": 1.263157894736842, + "grad_norm": 0.2712303102016449, + "learning_rate": 4.800000000000001e-06, + "loss": 0.0236, + "step": 96 + }, + { + "epoch": 1.2763157894736843, + "grad_norm": 0.2843461334705353, + "learning_rate": 4.85e-06, + "loss": 0.0195, + "step": 97 + }, + { + "epoch": 1.2894736842105263, + "grad_norm": 0.21141311526298523, + "learning_rate": 4.9000000000000005e-06, + "loss": 0.0223, + "step": 98 + }, + { + "epoch": 1.3026315789473684, + "grad_norm": 0.25484079122543335, + "learning_rate": 4.95e-06, + "loss": 0.0211, + "step": 99 + }, + { + "epoch": 1.3157894736842106, + "grad_norm": 0.358674556016922, + "learning_rate": 5e-06, + "loss": 0.027, + "step": 100 + }, + { + "epoch": 1.3289473684210527, + "grad_norm": 0.20442990958690643, + "learning_rate": 4.999902656502973e-06, + "loss": 0.0234, + "step": 101 + }, + { + "epoch": 1.3421052631578947, + "grad_norm": 0.2281407117843628, + "learning_rate": 4.9996106335924965e-06, + "loss": 0.0243, + "step": 102 + }, + { + "epoch": 1.3552631578947367, + "grad_norm": 0.23803724348545074, + "learning_rate": 4.999123954009797e-06, + "loss": 0.0189, + "step": 103 + }, + { + "epoch": 1.368421052631579, + "grad_norm": 0.21493025124073029, + "learning_rate": 4.998442655654946e-06, + "loss": 0.0207, + "step": 104 + }, + { + "epoch": 1.381578947368421, + "grad_norm": 0.2565159797668457, + "learning_rate": 4.997566791583916e-06, + "loss": 0.0178, + "step": 105 + }, + { + "epoch": 1.3947368421052633, + "grad_norm": 0.3488551378250122, + "learning_rate": 4.996496430004446e-06, + "loss": 0.0226, + "step": 106 + }, + { + "epoch": 1.4078947368421053, + "grad_norm": 0.27695611119270325, + "learning_rate": 4.995231654270726e-06, + 
"loss": 0.0189, + "step": 107 + }, + { + "epoch": 1.4210526315789473, + "grad_norm": 0.23477056622505188, + "learning_rate": 4.993772562876909e-06, + "loss": 0.0182, + "step": 108 + }, + { + "epoch": 1.4342105263157894, + "grad_norm": 0.22611404955387115, + "learning_rate": 4.992119269449445e-06, + "loss": 0.0168, + "step": 109 + }, + { + "epoch": 1.4473684210526316, + "grad_norm": 0.25616368651390076, + "learning_rate": 4.990271902738223e-06, + "loss": 0.022, + "step": 110 + }, + { + "epoch": 1.4605263157894737, + "grad_norm": 0.23842717707157135, + "learning_rate": 4.988230606606552e-06, + "loss": 0.0163, + "step": 111 + }, + { + "epoch": 1.4736842105263157, + "grad_norm": 0.24285905063152313, + "learning_rate": 4.985995540019956e-06, + "loss": 0.0202, + "step": 112 + }, + { + "epoch": 1.486842105263158, + "grad_norm": 0.24602730572223663, + "learning_rate": 4.983566877033791e-06, + "loss": 0.0173, + "step": 113 + }, + { + "epoch": 1.5, + "grad_norm": 0.26218464970588684, + "learning_rate": 4.980944806779698e-06, + "loss": 0.0206, + "step": 114 + }, + { + "epoch": 1.513157894736842, + "grad_norm": 0.2999787926673889, + "learning_rate": 4.9781295334508664e-06, + "loss": 0.0178, + "step": 115 + }, + { + "epoch": 1.526315789473684, + "grad_norm": 0.20500704646110535, + "learning_rate": 4.975121276286136e-06, + "loss": 0.0181, + "step": 116 + }, + { + "epoch": 1.5394736842105263, + "grad_norm": 0.25106561183929443, + "learning_rate": 4.9719202695529265e-06, + "loss": 0.0128, + "step": 117 + }, + { + "epoch": 1.5526315789473686, + "grad_norm": 0.2686936855316162, + "learning_rate": 4.968526762528988e-06, + "loss": 0.0146, + "step": 118 + }, + { + "epoch": 1.5657894736842106, + "grad_norm": 0.2770400047302246, + "learning_rate": 4.964941019482995e-06, + "loss": 0.0167, + "step": 119 + }, + { + "epoch": 1.5789473684210527, + "grad_norm": 0.27510589361190796, + "learning_rate": 4.961163319653959e-06, + "loss": 0.0162, + "step": 120 + }, + { + "epoch": 1.5921052631578947, 
+ "grad_norm": 0.3720133602619171, + "learning_rate": 4.9571939572294914e-06, + "loss": 0.0163, + "step": 121 + }, + { + "epoch": 1.6052631578947367, + "grad_norm": 0.2288741022348404, + "learning_rate": 4.953033241322887e-06, + "loss": 0.0133, + "step": 122 + }, + { + "epoch": 1.618421052631579, + "grad_norm": 0.31084850430488586, + "learning_rate": 4.948681495949055e-06, + "loss": 0.0124, + "step": 123 + }, + { + "epoch": 1.631578947368421, + "grad_norm": 0.19490985572338104, + "learning_rate": 4.944139059999286e-06, + "loss": 0.0114, + "step": 124 + }, + { + "epoch": 1.6447368421052633, + "grad_norm": 0.3074445426464081, + "learning_rate": 4.939406287214861e-06, + "loss": 0.0153, + "step": 125 + }, + { + "epoch": 1.6578947368421053, + "grad_norm": 0.29279908537864685, + "learning_rate": 4.9344835461595016e-06, + "loss": 0.0117, + "step": 126 + }, + { + "epoch": 1.6710526315789473, + "grad_norm": 0.3299407362937927, + "learning_rate": 4.929371220190671e-06, + "loss": 0.0128, + "step": 127 + }, + { + "epoch": 1.6842105263157894, + "grad_norm": 0.24818794429302216, + "learning_rate": 4.9240697074297205e-06, + "loss": 0.0146, + "step": 128 + }, + { + "epoch": 1.6973684210526314, + "grad_norm": 0.35983219742774963, + "learning_rate": 4.918579420730884e-06, + "loss": 0.0138, + "step": 129 + }, + { + "epoch": 1.7105263157894737, + "grad_norm": 0.2583932876586914, + "learning_rate": 4.912900787649124e-06, + "loss": 0.0136, + "step": 130 + }, + { + "epoch": 1.723684210526316, + "grad_norm": 0.20754319429397583, + "learning_rate": 4.907034250406846e-06, + "loss": 0.0116, + "step": 131 + }, + { + "epoch": 1.736842105263158, + "grad_norm": 0.30609601736068726, + "learning_rate": 4.900980265859449e-06, + "loss": 0.0111, + "step": 132 + }, + { + "epoch": 1.75, + "grad_norm": 0.3754304349422455, + "learning_rate": 4.894739305459754e-06, + "loss": 0.0126, + "step": 133 + }, + { + "epoch": 1.763157894736842, + "grad_norm": 0.2517055571079254, + "learning_rate": 
4.88831185522129e-06, + "loss": 0.0118, + "step": 134 + }, + { + "epoch": 1.776315789473684, + "grad_norm": 0.198478102684021, + "learning_rate": 4.881698415680442e-06, + "loss": 0.0087, + "step": 135 + }, + { + "epoch": 1.7894736842105263, + "grad_norm": 0.2307695895433426, + "learning_rate": 4.874899501857477e-06, + "loss": 0.0094, + "step": 136 + }, + { + "epoch": 1.8026315789473686, + "grad_norm": 0.17823486030101776, + "learning_rate": 4.867915643216434e-06, + "loss": 0.0098, + "step": 137 + }, + { + "epoch": 1.8157894736842106, + "grad_norm": 0.2157433032989502, + "learning_rate": 4.860747383623889e-06, + "loss": 0.0114, + "step": 138 + }, + { + "epoch": 1.8289473684210527, + "grad_norm": 0.21051311492919922, + "learning_rate": 4.85339528130661e-06, + "loss": 0.011, + "step": 139 + }, + { + "epoch": 1.8421052631578947, + "grad_norm": 0.32886555790901184, + "learning_rate": 4.845859908808074e-06, + "loss": 0.011, + "step": 140 + }, + { + "epoch": 1.8552631578947367, + "grad_norm": 0.22413378953933716, + "learning_rate": 4.838141852943891e-06, + "loss": 0.0087, + "step": 141 + }, + { + "epoch": 1.868421052631579, + "grad_norm": 0.2896019518375397, + "learning_rate": 4.830241714756099e-06, + "loss": 0.011, + "step": 142 + }, + { + "epoch": 1.881578947368421, + "grad_norm": 0.26163023710250854, + "learning_rate": 4.822160109466361e-06, + "loss": 0.0084, + "step": 143 + }, + { + "epoch": 1.8947368421052633, + "grad_norm": 0.23998413980007172, + "learning_rate": 4.813897666428054e-06, + "loss": 0.0094, + "step": 144 + }, + { + "epoch": 1.9078947368421053, + "grad_norm": 0.2334728091955185, + "learning_rate": 4.805455029077255e-06, + "loss": 0.007, + "step": 145 + }, + { + "epoch": 1.9210526315789473, + "grad_norm": 0.17431940138339996, + "learning_rate": 4.79683285488264e-06, + "loss": 0.0047, + "step": 146 + }, + { + "epoch": 1.9342105263157894, + "grad_norm": 0.19151932001113892, + "learning_rate": 4.788031815294282e-06, + "loss": 0.0056, + "step": 147 + }, + { + 
"epoch": 1.9473684210526314, + "grad_norm": 0.2352588027715683, + "learning_rate": 4.779052595691355e-06, + "loss": 0.0107, + "step": 148 + }, + { + "epoch": 1.9605263157894737, + "grad_norm": 0.2848915159702301, + "learning_rate": 4.76989589532877e-06, + "loss": 0.0074, + "step": 149 + }, + { + "epoch": 1.973684210526316, + "grad_norm": 0.218011736869812, + "learning_rate": 4.7605624272827125e-06, + "loss": 0.0075, + "step": 150 + }, + { + "epoch": 1.986842105263158, + "grad_norm": 0.3043143153190613, + "learning_rate": 4.75105291839512e-06, + "loss": 0.0073, + "step": 151 + }, + { + "epoch": 2.0, + "grad_norm": 0.16677772998809814, + "learning_rate": 4.741368109217072e-06, + "loss": 0.0065, + "step": 152 + }, + { + "epoch": 2.013157894736842, + "grad_norm": 0.14940837025642395, + "learning_rate": 4.7315087539511225e-06, + "loss": 0.0034, + "step": 153 + }, + { + "epoch": 2.026315789473684, + "grad_norm": 0.14960654079914093, + "learning_rate": 4.721475620392567e-06, + "loss": 0.0034, + "step": 154 + }, + { + "epoch": 2.039473684210526, + "grad_norm": 0.2261868566274643, + "learning_rate": 4.711269489869654e-06, + "loss": 0.0034, + "step": 155 + }, + { + "epoch": 2.0526315789473686, + "grad_norm": 0.20907555520534515, + "learning_rate": 4.700891157182729e-06, + "loss": 0.0045, + "step": 156 + }, + { + "epoch": 2.0657894736842106, + "grad_norm": 0.15571005642414093, + "learning_rate": 4.690341430542351e-06, + "loss": 0.0032, + "step": 157 + }, + { + "epoch": 2.0789473684210527, + "grad_norm": 0.16968725621700287, + "learning_rate": 4.679621131506347e-06, + "loss": 0.0044, + "step": 158 + }, + { + "epoch": 2.0921052631578947, + "grad_norm": 0.1937742531299591, + "learning_rate": 4.668731094915835e-06, + "loss": 0.0027, + "step": 159 + }, + { + "epoch": 2.1052631578947367, + "grad_norm": 0.14914007484912872, + "learning_rate": 4.657672168830211e-06, + "loss": 0.0031, + "step": 160 + }, + { + "epoch": 2.1184210526315788, + "grad_norm": 0.19651293754577637, + 
"learning_rate": 4.646445214461105e-06, + "loss": 0.0043, + "step": 161 + }, + { + "epoch": 2.1315789473684212, + "grad_norm": 0.2023143172264099, + "learning_rate": 4.635051106105316e-06, + "loss": 0.0036, + "step": 162 + }, + { + "epoch": 2.1447368421052633, + "grad_norm": 0.17952999472618103, + "learning_rate": 4.623490731076728e-06, + "loss": 0.0024, + "step": 163 + }, + { + "epoch": 2.1578947368421053, + "grad_norm": 0.18410134315490723, + "learning_rate": 4.6117649896372055e-06, + "loss": 0.0054, + "step": 164 + }, + { + "epoch": 2.1710526315789473, + "grad_norm": 0.18808087706565857, + "learning_rate": 4.59987479492649e-06, + "loss": 0.0039, + "step": 165 + }, + { + "epoch": 2.1842105263157894, + "grad_norm": 0.12346187978982925, + "learning_rate": 4.587821072891089e-06, + "loss": 0.0036, + "step": 166 + }, + { + "epoch": 2.1973684210526314, + "grad_norm": 0.140532448887825, + "learning_rate": 4.5756047622121665e-06, + "loss": 0.0028, + "step": 167 + }, + { + "epoch": 2.2105263157894735, + "grad_norm": 0.8201041221618652, + "learning_rate": 4.563226814232444e-06, + "loss": 0.0028, + "step": 168 + }, + { + "epoch": 2.223684210526316, + "grad_norm": 0.26919177174568176, + "learning_rate": 4.550688192882115e-06, + "loss": 0.0032, + "step": 169 + }, + { + "epoch": 2.236842105263158, + "grad_norm": 0.1321878731250763, + "learning_rate": 4.53798987460378e-06, + "loss": 0.0023, + "step": 170 + }, + { + "epoch": 2.25, + "grad_norm": 0.12545251846313477, + "learning_rate": 4.525132848276405e-06, + "loss": 0.0024, + "step": 171 + }, + { + "epoch": 2.263157894736842, + "grad_norm": 0.1377445012331009, + "learning_rate": 4.512118115138315e-06, + "loss": 0.0033, + "step": 172 + }, + { + "epoch": 2.276315789473684, + "grad_norm": 0.10942364484071732, + "learning_rate": 4.498946688709216e-06, + "loss": 0.0023, + "step": 173 + }, + { + "epoch": 2.2894736842105265, + "grad_norm": 0.17425717413425446, + "learning_rate": 4.485619594711278e-06, + "loss": 0.003, + "step": 174 + 
}, + { + "epoch": 2.3026315789473686, + "grad_norm": 0.15876342356204987, + "learning_rate": 4.4721378709892475e-06, + "loss": 0.0034, + "step": 175 + }, + { + "epoch": 2.3157894736842106, + "grad_norm": 0.1537884920835495, + "learning_rate": 4.4585025674296315e-06, + "loss": 0.002, + "step": 176 + }, + { + "epoch": 2.3289473684210527, + "grad_norm": 0.13558532297611237, + "learning_rate": 4.444714745878936e-06, + "loss": 0.0021, + "step": 177 + }, + { + "epoch": 2.3421052631578947, + "grad_norm": 0.14405666291713715, + "learning_rate": 4.430775480060973e-06, + "loss": 0.0028, + "step": 178 + }, + { + "epoch": 2.3552631578947367, + "grad_norm": 0.19296719133853912, + "learning_rate": 4.416685855493246e-06, + "loss": 0.0034, + "step": 179 + }, + { + "epoch": 2.3684210526315788, + "grad_norm": 0.2153935730457306, + "learning_rate": 4.4024469694024194e-06, + "loss": 0.0032, + "step": 180 + }, + { + "epoch": 2.3815789473684212, + "grad_norm": 0.11674188822507858, + "learning_rate": 4.388059930638865e-06, + "loss": 0.0013, + "step": 181 + }, + { + "epoch": 2.3947368421052633, + "grad_norm": 0.21349935233592987, + "learning_rate": 4.373525859590313e-06, + "loss": 0.002, + "step": 182 + }, + { + "epoch": 2.4078947368421053, + "grad_norm": 0.16676126420497894, + "learning_rate": 4.358845888094607e-06, + "loss": 0.0015, + "step": 183 + }, + { + "epoch": 2.4210526315789473, + "grad_norm": 0.20975975692272186, + "learning_rate": 4.3440211593515556e-06, + "loss": 0.0025, + "step": 184 + }, + { + "epoch": 2.4342105263157894, + "grad_norm": 0.3014683425426483, + "learning_rate": 4.32905282783391e-06, + "loss": 0.0031, + "step": 185 + }, + { + "epoch": 2.4473684210526314, + "grad_norm": 0.1687438040971756, + "learning_rate": 4.313942059197457e-06, + "loss": 0.0014, + "step": 186 + }, + { + "epoch": 2.4605263157894735, + "grad_norm": 0.13351179659366608, + "learning_rate": 4.298690030190247e-06, + "loss": 0.0012, + "step": 187 + }, + { + "epoch": 2.473684210526316, + "grad_norm": 
0.4079025387763977, + "learning_rate": 4.283297928560951e-06, + "loss": 0.0026, + "step": 188 + }, + { + "epoch": 2.486842105263158, + "grad_norm": 0.12639036774635315, + "learning_rate": 4.267766952966369e-06, + "loss": 0.0017, + "step": 189 + }, + { + "epoch": 2.5, + "grad_norm": 0.1551010012626648, + "learning_rate": 4.252098312878083e-06, + "loss": 0.0022, + "step": 190 + }, + { + "epoch": 2.513157894736842, + "grad_norm": 0.1431741863489151, + "learning_rate": 4.236293228488267e-06, + "loss": 0.0022, + "step": 191 + }, + { + "epoch": 2.526315789473684, + "grad_norm": 0.39600786566734314, + "learning_rate": 4.220352930614672e-06, + "loss": 0.0031, + "step": 192 + }, + { + "epoch": 2.5394736842105265, + "grad_norm": 0.13951376080513, + "learning_rate": 4.204278660604767e-06, + "loss": 0.0016, + "step": 193 + }, + { + "epoch": 2.5526315789473686, + "grad_norm": 0.10893042385578156, + "learning_rate": 4.1880716702390764e-06, + "loss": 0.0007, + "step": 194 + }, + { + "epoch": 2.5657894736842106, + "grad_norm": 0.16801239550113678, + "learning_rate": 4.171733221633695e-06, + "loss": 0.002, + "step": 195 + }, + { + "epoch": 2.5789473684210527, + "grad_norm": 0.11393369734287262, + "learning_rate": 4.155264587142002e-06, + "loss": 0.0017, + "step": 196 + }, + { + "epoch": 2.5921052631578947, + "grad_norm": 0.23128700256347656, + "learning_rate": 4.138667049255574e-06, + "loss": 0.0018, + "step": 197 + }, + { + "epoch": 2.6052631578947367, + "grad_norm": 0.06730300188064575, + "learning_rate": 4.121941900504316e-06, + "loss": 0.0006, + "step": 198 + }, + { + "epoch": 2.6184210526315788, + "grad_norm": 0.11693810671567917, + "learning_rate": 4.105090443355801e-06, + "loss": 0.0012, + "step": 199 + }, + { + "epoch": 2.6315789473684212, + "grad_norm": 0.1186346486210823, + "learning_rate": 4.088113990113846e-06, + "loss": 0.0012, + "step": 200 + }, + { + "epoch": 2.6447368421052633, + "grad_norm": 0.2584531605243683, + "learning_rate": 4.071013862816311e-06, + "loss": 
0.0025, + "step": 201 + }, + { + "epoch": 2.6578947368421053, + "grad_norm": 0.09868124127388, + "learning_rate": 4.0537913931321495e-06, + "loss": 0.0017, + "step": 202 + }, + { + "epoch": 2.6710526315789473, + "grad_norm": 0.09907737374305725, + "learning_rate": 4.036447922257699e-06, + "loss": 0.002, + "step": 203 + }, + { + "epoch": 2.6842105263157894, + "grad_norm": 0.06743597984313965, + "learning_rate": 4.018984800812248e-06, + "loss": 0.0006, + "step": 204 + }, + { + "epoch": 2.6973684210526314, + "grad_norm": 0.08913715183734894, + "learning_rate": 4.001403388732842e-06, + "loss": 0.0007, + "step": 205 + }, + { + "epoch": 2.7105263157894735, + "grad_norm": 0.12334564328193665, + "learning_rate": 3.983705055168391e-06, + "loss": 0.0006, + "step": 206 + }, + { + "epoch": 2.723684210526316, + "grad_norm": 0.10878646373748779, + "learning_rate": 3.965891178373038e-06, + "loss": 0.0016, + "step": 207 + }, + { + "epoch": 2.736842105263158, + "grad_norm": 0.10623253136873245, + "learning_rate": 3.947963145598833e-06, + "loss": 0.0015, + "step": 208 + }, + { + "epoch": 2.75, + "grad_norm": 0.15580499172210693, + "learning_rate": 3.929922352987702e-06, + "loss": 0.0011, + "step": 209 + }, + { + "epoch": 2.763157894736842, + "grad_norm": 0.06405838578939438, + "learning_rate": 3.911770205462717e-06, + "loss": 0.0007, + "step": 210 + }, + { + "epoch": 2.776315789473684, + "grad_norm": 0.17784689366817474, + "learning_rate": 3.8935081166186935e-06, + "loss": 0.0017, + "step": 211 + }, + { + "epoch": 2.7894736842105265, + "grad_norm": 0.14516866207122803, + "learning_rate": 3.875137508612104e-06, + "loss": 0.0014, + "step": 212 + }, + { + "epoch": 2.8026315789473686, + "grad_norm": 0.09510776400566101, + "learning_rate": 3.856659812050328e-06, + "loss": 0.0009, + "step": 213 + }, + { + "epoch": 2.8157894736842106, + "grad_norm": 0.1000828891992569, + "learning_rate": 3.838076465880248e-06, + "loss": 0.0008, + "step": 214 + }, + { + "epoch": 2.8289473684210527, + 
"grad_norm": 0.10773428529500961, + "learning_rate": 3.819388917276186e-06, + "loss": 0.0008, + "step": 215 + }, + { + "epoch": 2.8421052631578947, + "grad_norm": 0.12319213151931763, + "learning_rate": 3.8005986215272056e-06, + "loss": 0.0007, + "step": 216 + }, + { + "epoch": 2.8552631578947367, + "grad_norm": 0.07209170609712601, + "learning_rate": 3.7817070419237866e-06, + "loss": 0.0006, + "step": 217 + }, + { + "epoch": 2.8684210526315788, + "grad_norm": 0.12889248132705688, + "learning_rate": 3.7627156496438686e-06, + "loss": 0.0005, + "step": 218 + }, + { + "epoch": 2.8815789473684212, + "grad_norm": 0.05019540339708328, + "learning_rate": 3.7436259236382797e-06, + "loss": 0.0003, + "step": 219 + }, + { + "epoch": 2.8947368421052633, + "grad_norm": 0.10657542198896408, + "learning_rate": 3.7244393505155713e-06, + "loss": 0.0008, + "step": 220 + }, + { + "epoch": 2.9078947368421053, + "grad_norm": 0.15984083712100983, + "learning_rate": 3.7051574244262412e-06, + "loss": 0.0003, + "step": 221 + }, + { + "epoch": 2.9210526315789473, + "grad_norm": 0.1567343920469284, + "learning_rate": 3.6857816469463806e-06, + "loss": 0.0005, + "step": 222 + }, + { + "epoch": 2.9342105263157894, + "grad_norm": 0.07294822484254837, + "learning_rate": 3.6663135269607413e-06, + "loss": 0.0006, + "step": 223 + }, + { + "epoch": 2.9473684210526314, + "grad_norm": 0.04486699402332306, + "learning_rate": 3.6467545805452266e-06, + "loss": 0.0003, + "step": 224 + }, + { + "epoch": 2.9605263157894735, + "grad_norm": 0.08188032358884811, + "learning_rate": 3.6271063308488298e-06, + "loss": 0.0004, + "step": 225 + }, + { + "epoch": 2.973684210526316, + "grad_norm": 0.029995013028383255, + "learning_rate": 3.6073703079750204e-06, + "loss": 0.0002, + "step": 226 + }, + { + "epoch": 2.986842105263158, + "grad_norm": 0.029155094176530838, + "learning_rate": 3.5875480488625847e-06, + "loss": 0.0002, + "step": 227 + }, + { + "epoch": 3.0, + "grad_norm": 0.058334361761808395, + "learning_rate": 
3.5676410971659404e-06, + "loss": 0.0003, + "step": 228 + }, + { + "epoch": 3.013157894736842, + "grad_norm": 0.07227180153131485, + "learning_rate": 3.547651003134921e-06, + "loss": 0.0002, + "step": 229 + }, + { + "epoch": 3.026315789473684, + "grad_norm": 0.03385070338845253, + "learning_rate": 3.527579323494055e-06, + "loss": 0.0001, + "step": 230 + }, + { + "epoch": 3.039473684210526, + "grad_norm": 0.025327105075120926, + "learning_rate": 3.507427621321331e-06, + "loss": 0.0002, + "step": 231 + }, + { + "epoch": 3.0526315789473686, + "grad_norm": 0.17931883037090302, + "learning_rate": 3.4871974659264786e-06, + "loss": 0.0004, + "step": 232 + }, + { + "epoch": 3.0657894736842106, + "grad_norm": 0.05045541375875473, + "learning_rate": 3.466890432728754e-06, + "loss": 0.0001, + "step": 233 + }, + { + "epoch": 3.0789473684210527, + "grad_norm": 0.1170194149017334, + "learning_rate": 3.446508103134259e-06, + "loss": 0.0003, + "step": 234 + }, + { + "epoch": 3.0921052631578947, + "grad_norm": 0.01339724287390709, + "learning_rate": 3.426052064412785e-06, + "loss": 0.0001, + "step": 235 + }, + { + "epoch": 3.1052631578947367, + "grad_norm": 0.5850052833557129, + "learning_rate": 3.4055239095742067e-06, + "loss": 0.0004, + "step": 236 + }, + { + "epoch": 3.1184210526315788, + "grad_norm": 0.06263412535190582, + "learning_rate": 3.3849252372444295e-06, + "loss": 0.0008, + "step": 237 + }, + { + "epoch": 3.1315789473684212, + "grad_norm": 0.02470085583627224, + "learning_rate": 3.364257651540891e-06, + "loss": 0.0001, + "step": 238 + }, + { + "epoch": 3.1447368421052633, + "grad_norm": 0.1614137440919876, + "learning_rate": 3.343522761947646e-06, + "loss": 0.0004, + "step": 239 + }, + { + "epoch": 3.1578947368421053, + "grad_norm": 0.04927680641412735, + "learning_rate": 3.322722183190025e-06, + "loss": 0.0002, + "step": 240 + }, + { + "epoch": 3.1710526315789473, + "grad_norm": 0.04460301250219345, + "learning_rate": 3.3018575351088894e-06, + "loss": 0.0001, + 
"step": 241 + }, + { + "epoch": 3.1842105263157894, + "grad_norm": 0.0417938195168972, + "learning_rate": 3.280930442534486e-06, + "loss": 0.0001, + "step": 242 + }, + { + "epoch": 3.1973684210526314, + "grad_norm": 0.04140309989452362, + "learning_rate": 3.2599425351599136e-06, + "loss": 0.0001, + "step": 243 + }, + { + "epoch": 3.2105263157894735, + "grad_norm": 0.36172497272491455, + "learning_rate": 3.238895447414211e-06, + "loss": 0.001, + "step": 244 + }, + { + "epoch": 3.223684210526316, + "grad_norm": 0.19054804742336273, + "learning_rate": 3.217790818335077e-06, + "loss": 0.0005, + "step": 245 + }, + { + "epoch": 3.236842105263158, + "grad_norm": 0.031062051653862, + "learning_rate": 3.196630291441231e-06, + "loss": 0.0001, + "step": 246 + }, + { + "epoch": 3.25, + "grad_norm": 0.06263001263141632, + "learning_rate": 3.175415514604422e-06, + "loss": 0.0002, + "step": 247 + }, + { + "epoch": 3.263157894736842, + "grad_norm": 0.016942007467150688, + "learning_rate": 3.154148139921102e-06, + "loss": 0.0001, + "step": 248 + }, + { + "epoch": 3.276315789473684, + "grad_norm": 0.03281901404261589, + "learning_rate": 3.132829823583771e-06, + "loss": 0.0002, + "step": 249 + }, + { + "epoch": 3.2894736842105265, + "grad_norm": 0.06915819644927979, + "learning_rate": 3.1114622257520004e-06, + "loss": 0.0002, + "step": 250 + }, + { + "epoch": 3.3026315789473686, + "grad_norm": 0.029176127165555954, + "learning_rate": 3.0900470104231456e-06, + "loss": 0.0001, + "step": 251 + }, + { + "epoch": 3.3157894736842106, + "grad_norm": 0.03844618797302246, + "learning_rate": 3.0685858453027668e-06, + "loss": 0.0002, + "step": 252 + }, + { + "epoch": 3.3289473684210527, + "grad_norm": 0.1381211280822754, + "learning_rate": 3.047080401674754e-06, + "loss": 0.0023, + "step": 253 + }, + { + "epoch": 3.3421052631578947, + "grad_norm": 0.05453269183635712, + "learning_rate": 3.0255323542711784e-06, + "loss": 0.0007, + "step": 254 + }, + { + "epoch": 3.3552631578947367, + 
"grad_norm": 0.17172302305698395, + "learning_rate": 3.00394338114187e-06, + "loss": 0.0002, + "step": 255 + }, + { + "epoch": 3.3684210526315788, + "grad_norm": 0.09509152173995972, + "learning_rate": 2.9823151635237424e-06, + "loss": 0.0003, + "step": 256 + }, + { + "epoch": 3.3815789473684212, + "grad_norm": 0.049044203013181686, + "learning_rate": 2.9606493857098657e-06, + "loss": 0.0003, + "step": 257 + }, + { + "epoch": 3.3947368421052633, + "grad_norm": 0.01755386032164097, + "learning_rate": 2.938947734918302e-06, + "loss": 0.0001, + "step": 258 + }, + { + "epoch": 3.4078947368421053, + "grad_norm": 0.06148788705468178, + "learning_rate": 2.9172119011607153e-06, + "loss": 0.0002, + "step": 259 + }, + { + "epoch": 3.4210526315789473, + "grad_norm": 0.047921039164066315, + "learning_rate": 2.8954435771107604e-06, + "loss": 0.0001, + "step": 260 + }, + { + "epoch": 3.4342105263157894, + "grad_norm": 0.04058835282921791, + "learning_rate": 2.8736444579722665e-06, + "loss": 0.0002, + "step": 261 + }, + { + "epoch": 3.4473684210526314, + "grad_norm": 0.04581223055720329, + "learning_rate": 2.8518162413472266e-06, + "loss": 0.0003, + "step": 262 + }, + { + "epoch": 3.4605263157894735, + "grad_norm": 0.060498207807540894, + "learning_rate": 2.8299606271035913e-06, + "loss": 0.0002, + "step": 263 + }, + { + "epoch": 3.473684210526316, + "grad_norm": 0.025159459561109543, + "learning_rate": 2.8080793172428965e-06, + "loss": 0.0001, + "step": 264 + }, + { + "epoch": 3.486842105263158, + "grad_norm": 0.009322446770966053, + "learning_rate": 2.786174015767721e-06, + "loss": 0.0, + "step": 265 + }, + { + "epoch": 3.5, + "grad_norm": 0.048437751829624176, + "learning_rate": 2.764246428548983e-06, + "loss": 0.0002, + "step": 266 + }, + { + "epoch": 3.513157894736842, + "grad_norm": 0.025815390050411224, + "learning_rate": 2.742298263193099e-06, + "loss": 0.0001, + "step": 267 + }, + { + "epoch": 3.526315789473684, + "grad_norm": 0.038374874740839005, + "learning_rate": 
2.720331228909005e-06, + "loss": 0.0002, + "step": 268 + }, + { + "epoch": 3.5394736842105265, + "grad_norm": 0.010204033926129341, + "learning_rate": 2.6983470363750497e-06, + "loss": 0.0001, + "step": 269 + }, + { + "epoch": 3.5526315789473686, + "grad_norm": 0.004963386803865433, + "learning_rate": 2.6763473976057776e-06, + "loss": 0.0, + "step": 270 + }, + { + "epoch": 3.5657894736842106, + "grad_norm": 0.0680844709277153, + "learning_rate": 2.6543340258186063e-06, + "loss": 0.0002, + "step": 271 + }, + { + "epoch": 3.5789473684210527, + "grad_norm": 0.01941310614347458, + "learning_rate": 2.6323086353004077e-06, + "loss": 0.0001, + "step": 272 + }, + { + "epoch": 3.5921052631578947, + "grad_norm": 0.07440650463104248, + "learning_rate": 2.610272941274012e-06, + "loss": 0.0001, + "step": 273 + }, + { + "epoch": 3.6052631578947367, + "grad_norm": 0.044731903821229935, + "learning_rate": 2.588228659764632e-06, + "loss": 0.0002, + "step": 274 + }, + { + "epoch": 3.6184210526315788, + "grad_norm": 0.021858500316739082, + "learning_rate": 2.5661775074662276e-06, + "loss": 0.0001, + "step": 275 + }, + { + "epoch": 3.6315789473684212, + "grad_norm": 0.03670589625835419, + "learning_rate": 2.544121201607822e-06, + "loss": 0.0002, + "step": 276 + }, + { + "epoch": 3.6447368421052633, + "grad_norm": 0.015562576241791248, + "learning_rate": 2.5220614598197708e-06, + "loss": 0.0001, + "step": 277 + }, + { + "epoch": 3.6578947368421053, + "grad_norm": 0.011691650375723839, + "learning_rate": 2.5e-06, + "loss": 0.0, + "step": 278 + }, + { + "epoch": 3.6710526315789473, + "grad_norm": 0.04999399557709694, + "learning_rate": 2.477938540180231e-06, + "loss": 0.0001, + "step": 279 + }, + { + "epoch": 3.6842105263157894, + "grad_norm": 0.07675550132989883, + "learning_rate": 2.455878798392179e-06, + "loss": 0.0001, + "step": 280 + }, + { + "epoch": 3.6973684210526314, + "grad_norm": 0.03533118963241577, + "learning_rate": 2.433822492533774e-06, + "loss": 0.0001, + "step": 281 + 
}, + { + "epoch": 3.7105263157894735, + "grad_norm": 0.03588724508881569, + "learning_rate": 2.411771340235369e-06, + "loss": 0.0003, + "step": 282 + }, + { + "epoch": 3.723684210526316, + "grad_norm": 0.020976359024643898, + "learning_rate": 2.389727058725989e-06, + "loss": 0.0001, + "step": 283 + }, + { + "epoch": 3.736842105263158, + "grad_norm": 0.008930912241339684, + "learning_rate": 2.3676913646995923e-06, + "loss": 0.0, + "step": 284 + }, + { + "epoch": 3.75, + "grad_norm": 0.02440304309129715, + "learning_rate": 2.3456659741813945e-06, + "loss": 0.0001, + "step": 285 + }, + { + "epoch": 3.763157894736842, + "grad_norm": 0.006568162236362696, + "learning_rate": 2.3236526023942224e-06, + "loss": 0.0, + "step": 286 + }, + { + "epoch": 3.776315789473684, + "grad_norm": 0.004894652403891087, + "learning_rate": 2.301652963624951e-06, + "loss": 0.0, + "step": 287 + }, + { + "epoch": 3.7894736842105265, + "grad_norm": 0.05479031801223755, + "learning_rate": 2.2796687710909966e-06, + "loss": 0.0003, + "step": 288 + }, + { + "epoch": 3.8026315789473686, + "grad_norm": 0.009381825104355812, + "learning_rate": 2.2577017368069017e-06, + "loss": 0.0, + "step": 289 + }, + { + "epoch": 3.8157894736842106, + "grad_norm": 0.011159488931298256, + "learning_rate": 2.235753571451018e-06, + "loss": 0.0, + "step": 290 + }, + { + "epoch": 3.8289473684210527, + "grad_norm": 0.0077633000910282135, + "learning_rate": 2.2138259842322794e-06, + "loss": 0.0, + "step": 291 + }, + { + "epoch": 3.8421052631578947, + "grad_norm": 0.040188852697610855, + "learning_rate": 2.191920682757104e-06, + "loss": 0.0002, + "step": 292 + }, + { + "epoch": 3.8552631578947367, + "grad_norm": 0.03593844920396805, + "learning_rate": 2.170039372896409e-06, + "loss": 0.0001, + "step": 293 + }, + { + "epoch": 3.8684210526315788, + "grad_norm": 0.011377676390111446, + "learning_rate": 2.148183758652774e-06, + "loss": 0.0, + "step": 294 + }, + { + "epoch": 3.8815789473684212, + "grad_norm": 
0.018670417368412018, + "learning_rate": 2.126355542027734e-06, + "loss": 0.0, + "step": 295 + }, + { + "epoch": 3.8947368421052633, + "grad_norm": 0.026110412552952766, + "learning_rate": 2.1045564228892404e-06, + "loss": 0.0001, + "step": 296 + }, + { + "epoch": 3.9078947368421053, + "grad_norm": 0.04683105647563934, + "learning_rate": 2.0827880988392856e-06, + "loss": 0.0001, + "step": 297 + }, + { + "epoch": 3.9210526315789473, + "grad_norm": 0.04568646103143692, + "learning_rate": 2.0610522650816985e-06, + "loss": 0.0001, + "step": 298 + }, + { + "epoch": 3.9342105263157894, + "grad_norm": 0.0064105079509317875, + "learning_rate": 2.0393506142901347e-06, + "loss": 0.0, + "step": 299 + }, + { + "epoch": 3.9473684210526314, + "grad_norm": 0.004660090897232294, + "learning_rate": 2.017684836476258e-06, + "loss": 0.0, + "step": 300 + }, + { + "epoch": 3.9605263157894735, + "grad_norm": 0.023000195622444153, + "learning_rate": 1.9960566188581306e-06, + "loss": 0.0001, + "step": 301 + }, + { + "epoch": 3.973684210526316, + "grad_norm": 0.010502593591809273, + "learning_rate": 1.9744676457288225e-06, + "loss": 0.0, + "step": 302 + }, + { + "epoch": 3.986842105263158, + "grad_norm": 0.017971891909837723, + "learning_rate": 1.952919598325247e-06, + "loss": 0.0001, + "step": 303 + }, + { + "epoch": 4.0, + "grad_norm": 0.019201353192329407, + "learning_rate": 1.9314141546972345e-06, + "loss": 0.0001, + "step": 304 + }, + { + "epoch": 4.0131578947368425, + "grad_norm": 0.014774768613278866, + "learning_rate": 1.9099529895768552e-06, + "loss": 0.0001, + "step": 305 + }, + { + "epoch": 4.026315789473684, + "grad_norm": 0.004470722749829292, + "learning_rate": 1.8885377742480005e-06, + "loss": 0.0, + "step": 306 + }, + { + "epoch": 4.0394736842105265, + "grad_norm": 0.0017719039460644126, + "learning_rate": 1.8671701764162287e-06, + "loss": 0.0, + "step": 307 + }, + { + "epoch": 4.052631578947368, + "grad_norm": 0.0023945686407387257, + "learning_rate": 
1.8458518600788988e-06, + "loss": 0.0, + "step": 308 + }, + { + "epoch": 4.065789473684211, + "grad_norm": 0.008764254860579967, + "learning_rate": 1.8245844853955786e-06, + "loss": 0.0, + "step": 309 + }, + { + "epoch": 4.078947368421052, + "grad_norm": 0.007987024262547493, + "learning_rate": 1.8033697085587698e-06, + "loss": 0.0, + "step": 310 + }, + { + "epoch": 4.092105263157895, + "grad_norm": 0.0034283557906746864, + "learning_rate": 1.782209181664924e-06, + "loss": 0.0, + "step": 311 + }, + { + "epoch": 4.105263157894737, + "grad_norm": 0.006517547182738781, + "learning_rate": 1.7611045525857902e-06, + "loss": 0.0, + "step": 312 + }, + { + "epoch": 4.118421052631579, + "grad_norm": 0.001932072453200817, + "learning_rate": 1.740057464840088e-06, + "loss": 0.0, + "step": 313 + }, + { + "epoch": 4.131578947368421, + "grad_norm": 0.0014661611057817936, + "learning_rate": 1.7190695574655147e-06, + "loss": 0.0, + "step": 314 + }, + { + "epoch": 4.144736842105263, + "grad_norm": 0.00624146917834878, + "learning_rate": 1.6981424648911112e-06, + "loss": 0.0, + "step": 315 + }, + { + "epoch": 4.157894736842105, + "grad_norm": 0.008011729456484318, + "learning_rate": 1.677277816809975e-06, + "loss": 0.0, + "step": 316 + }, + { + "epoch": 4.171052631578948, + "grad_norm": 0.009674855507910252, + "learning_rate": 1.6564772380523546e-06, + "loss": 0.0, + "step": 317 + }, + { + "epoch": 4.184210526315789, + "grad_norm": 0.0071542332880198956, + "learning_rate": 1.635742348459109e-06, + "loss": 0.0, + "step": 318 + }, + { + "epoch": 4.197368421052632, + "grad_norm": 0.009431720711290836, + "learning_rate": 1.6150747627555713e-06, + "loss": 0.0, + "step": 319 + }, + { + "epoch": 4.2105263157894735, + "grad_norm": 0.009224318899214268, + "learning_rate": 1.5944760904257944e-06, + "loss": 0.0, + "step": 320 + }, + { + "epoch": 4.223684210526316, + "grad_norm": 0.010647057555615902, + "learning_rate": 1.5739479355872162e-06, + "loss": 0.0, + "step": 321 + }, + { + "epoch": 
4.2368421052631575, + "grad_norm": 0.015793612226843834, + "learning_rate": 1.5534918968657423e-06, + "loss": 0.0, + "step": 322 + }, + { + "epoch": 4.25, + "grad_norm": 0.004004765767604113, + "learning_rate": 1.5331095672712463e-06, + "loss": 0.0, + "step": 323 + }, + { + "epoch": 4.2631578947368425, + "grad_norm": 0.003939881455153227, + "learning_rate": 1.5128025340735223e-06, + "loss": 0.0, + "step": 324 + }, + { + "epoch": 4.276315789473684, + "grad_norm": 0.002702821511775255, + "learning_rate": 1.4925723786786691e-06, + "loss": 0.0, + "step": 325 + }, + { + "epoch": 4.2894736842105265, + "grad_norm": 0.0027509802021086216, + "learning_rate": 1.4724206765059456e-06, + "loss": 0.0, + "step": 326 + }, + { + "epoch": 4.302631578947368, + "grad_norm": 0.007367170415818691, + "learning_rate": 1.4523489968650795e-06, + "loss": 0.0, + "step": 327 + }, + { + "epoch": 4.315789473684211, + "grad_norm": 0.004167403094470501, + "learning_rate": 1.4323589028340598e-06, + "loss": 0.0, + "step": 328 + }, + { + "epoch": 4.328947368421053, + "grad_norm": 0.014962514862418175, + "learning_rate": 1.4124519511374158e-06, + "loss": 0.0, + "step": 329 + }, + { + "epoch": 4.342105263157895, + "grad_norm": 0.0012889838544651866, + "learning_rate": 1.3926296920249796e-06, + "loss": 0.0, + "step": 330 + }, + { + "epoch": 4.355263157894737, + "grad_norm": 0.006448778789490461, + "learning_rate": 1.3728936691511704e-06, + "loss": 0.0, + "step": 331 + }, + { + "epoch": 4.368421052631579, + "grad_norm": 0.004137519281357527, + "learning_rate": 1.3532454194547734e-06, + "loss": 0.0, + "step": 332 + }, + { + "epoch": 4.381578947368421, + "grad_norm": 0.006182932294905186, + "learning_rate": 1.3336864730392587e-06, + "loss": 0.0, + "step": 333 + }, + { + "epoch": 4.394736842105263, + "grad_norm": 0.0014147404581308365, + "learning_rate": 1.314218353053619e-06, + "loss": 0.0, + "step": 334 + }, + { + "epoch": 4.407894736842105, + "grad_norm": 0.007514857687056065, + "learning_rate": 
1.2948425755737592e-06, + "loss": 0.0, + "step": 335 + }, + { + "epoch": 4.421052631578947, + "grad_norm": 0.02276490069925785, + "learning_rate": 1.2755606494844294e-06, + "loss": 0.0001, + "step": 336 + }, + { + "epoch": 4.434210526315789, + "grad_norm": 0.0013512762961909175, + "learning_rate": 1.2563740763617198e-06, + "loss": 0.0, + "step": 337 + }, + { + "epoch": 4.447368421052632, + "grad_norm": 0.002018272178247571, + "learning_rate": 1.2372843503561318e-06, + "loss": 0.0, + "step": 338 + }, + { + "epoch": 4.4605263157894735, + "grad_norm": 0.0042894985526800156, + "learning_rate": 1.218292958076213e-06, + "loss": 0.0, + "step": 339 + }, + { + "epoch": 4.473684210526316, + "grad_norm": 0.0010812514228746295, + "learning_rate": 1.1994013784727948e-06, + "loss": 0.0, + "step": 340 + }, + { + "epoch": 4.4868421052631575, + "grad_norm": 0.0041869827546179295, + "learning_rate": 1.180611082723814e-06, + "loss": 0.0, + "step": 341 + }, + { + "epoch": 4.5, + "grad_norm": 0.010349901393055916, + "learning_rate": 1.161923534119752e-06, + "loss": 0.0, + "step": 342 + }, + { + "epoch": 4.5131578947368425, + "grad_norm": 0.007921607233583927, + "learning_rate": 1.1433401879496723e-06, + "loss": 0.0, + "step": 343 + }, + { + "epoch": 4.526315789473684, + "grad_norm": 0.0013929491396993399, + "learning_rate": 1.1248624913878966e-06, + "loss": 0.0, + "step": 344 + }, + { + "epoch": 4.5394736842105265, + "grad_norm": 0.003571738488972187, + "learning_rate": 1.1064918833813073e-06, + "loss": 0.0, + "step": 345 + }, + { + "epoch": 4.552631578947368, + "grad_norm": 0.0017180831637233496, + "learning_rate": 1.088229794537283e-06, + "loss": 0.0, + "step": 346 + }, + { + "epoch": 4.565789473684211, + "grad_norm": 0.0025168503634631634, + "learning_rate": 1.0700776470122981e-06, + "loss": 0.0, + "step": 347 + }, + { + "epoch": 4.578947368421053, + "grad_norm": 0.010161465033888817, + "learning_rate": 1.0520368544011661e-06, + "loss": 0.0, + "step": 348 + }, + { + "epoch": 
4.592105263157895, + "grad_norm": 0.0024475576356053352, + "learning_rate": 1.0341088216269625e-06, + "loss": 0.0, + "step": 349 + }, + { + "epoch": 4.605263157894737, + "grad_norm": 0.0035759019665420055, + "learning_rate": 1.0162949448316089e-06, + "loss": 0.0, + "step": 350 + }, + { + "epoch": 4.618421052631579, + "grad_norm": 0.002835271880030632, + "learning_rate": 9.98596611267158e-07, + "loss": 0.0, + "step": 351 + }, + { + "epoch": 4.631578947368421, + "grad_norm": 0.014862479642033577, + "learning_rate": 9.81015199187753e-07, + "loss": 0.0, + "step": 352 + }, + { + "epoch": 4.644736842105263, + "grad_norm": 0.0009296426433138549, + "learning_rate": 9.63552077742301e-07, + "loss": 0.0, + "step": 353 + }, + { + "epoch": 4.657894736842105, + "grad_norm": 0.002957531251013279, + "learning_rate": 9.462086068678519e-07, + "loss": 0.0, + "step": 354 + }, + { + "epoch": 4.671052631578947, + "grad_norm": 0.0018816434312611818, + "learning_rate": 9.289861371836886e-07, + "loss": 0.0, + "step": 355 + }, + { + "epoch": 4.684210526315789, + "grad_norm": 0.0049733612686395645, + "learning_rate": 9.118860098861538e-07, + "loss": 0.0, + "step": 356 + }, + { + "epoch": 4.697368421052632, + "grad_norm": 0.0017015141202136874, + "learning_rate": 8.949095566441985e-07, + "loss": 0.0, + "step": 357 + }, + { + "epoch": 4.7105263157894735, + "grad_norm": 0.002689789514988661, + "learning_rate": 8.78058099495685e-07, + "loss": 0.0, + "step": 358 + }, + { + "epoch": 4.723684210526316, + "grad_norm": 0.00439384626224637, + "learning_rate": 8.613329507444274e-07, + "loss": 0.0, + "step": 359 + }, + { + "epoch": 4.7368421052631575, + "grad_norm": 0.0011757826432585716, + "learning_rate": 8.44735412857999e-07, + "loss": 0.0, + "step": 360 + }, + { + "epoch": 4.75, + "grad_norm": 0.004845093935728073, + "learning_rate": 8.282667783663056e-07, + "loss": 0.0, + "step": 361 + }, + { + "epoch": 4.7631578947368425, + "grad_norm": 0.0011784805683419108, + "learning_rate": 
8.119283297609238e-07, + "loss": 0.0, + "step": 362 + }, + { + "epoch": 4.776315789473684, + "grad_norm": 0.024683140218257904, + "learning_rate": 7.957213393952335e-07, + "loss": 0.0001, + "step": 363 + }, + { + "epoch": 4.7894736842105265, + "grad_norm": 0.0012061076704412699, + "learning_rate": 7.796470693853281e-07, + "loss": 0.0, + "step": 364 + }, + { + "epoch": 4.802631578947368, + "grad_norm": 0.003571854904294014, + "learning_rate": 7.637067715117327e-07, + "loss": 0.0, + "step": 365 + }, + { + "epoch": 4.815789473684211, + "grad_norm": 0.0021892369259148836, + "learning_rate": 7.479016871219174e-07, + "loss": 0.0, + "step": 366 + }, + { + "epoch": 4.828947368421053, + "grad_norm": 0.005559162236750126, + "learning_rate": 7.322330470336314e-07, + "loss": 0.0, + "step": 367 + }, + { + "epoch": 4.842105263157895, + "grad_norm": 0.021778438240289688, + "learning_rate": 7.167020714390502e-07, + "loss": 0.0001, + "step": 368 + }, + { + "epoch": 4.855263157894737, + "grad_norm": 0.0014009246369823813, + "learning_rate": 7.013099698097539e-07, + "loss": 0.0, + "step": 369 + }, + { + "epoch": 4.868421052631579, + "grad_norm": 0.002133658155798912, + "learning_rate": 6.860579408025436e-07, + "loss": 0.0, + "step": 370 + }, + { + "epoch": 4.881578947368421, + "grad_norm": 0.01783628761768341, + "learning_rate": 6.709471721660904e-07, + "loss": 0.0001, + "step": 371 + }, + { + "epoch": 4.894736842105263, + "grad_norm": 0.02103368379175663, + "learning_rate": 6.559788406484446e-07, + "loss": 0.0001, + "step": 372 + }, + { + "epoch": 4.907894736842105, + "grad_norm": 0.0016494387527927756, + "learning_rate": 6.41154111905393e-07, + "loss": 0.0, + "step": 373 + }, + { + "epoch": 4.921052631578947, + "grad_norm": 0.00294311111792922, + "learning_rate": 6.264741404096875e-07, + "loss": 0.0, + "step": 374 + }, + { + "epoch": 4.934210526315789, + "grad_norm": 0.0028199541848152876, + "learning_rate": 6.119400693611358e-07, + "loss": 0.0, + "step": 375 + }, + { + "epoch": 
4.947368421052632, + "grad_norm": 0.0007045238162390888, + "learning_rate": 5.975530305975808e-07, + "loss": 0.0, + "step": 376 + }, + { + "epoch": 4.9605263157894735, + "grad_norm": 0.0008912527118809521, + "learning_rate": 5.833141445067541e-07, + "loss": 0.0, + "step": 377 + }, + { + "epoch": 4.973684210526316, + "grad_norm": 0.003858179785311222, + "learning_rate": 5.692245199390281e-07, + "loss": 0.0, + "step": 378 + }, + { + "epoch": 4.9868421052631575, + "grad_norm": 0.0017520582769066095, + "learning_rate": 5.552852541210651e-07, + "loss": 0.0, + "step": 379 + }, + { + "epoch": 5.0, + "grad_norm": 0.004236061125993729, + "learning_rate": 5.414974325703687e-07, + "loss": 0.0, + "step": 380 + } + ], + "logging_steps": 1, + "max_steps": 456, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 76, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.4446096401561027e+19, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-380/training_args.bin b/checkpoint-380/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..5a1edbdcc63a93daa09112168cf20c0f8fcb7512 --- /dev/null +++ b/checkpoint-380/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:041cfaa5bf8383821dea4fa5a9d2eab2caad4644c4cd651398c8b0ab1541b270 +size 7992 diff --git a/checkpoint-380/zero_to_fp32.py b/checkpoint-380/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/checkpoint-380/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, 
"zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = 
zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, 
zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + 
full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + 
wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # 
recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. 
+ + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. 
(one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/checkpoint-76/README.md b/checkpoint-76/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fdf619c317c2fe82074662582dbd68166b6f9d50 --- /dev/null +++ b/checkpoint-76/README.md @@ -0,0 +1,202 @@ +--- +base_model: nvidia/Llama-3.1-Nemotron-70B-Instruct-HF +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.0 \ No newline at end of file diff --git a/checkpoint-76/adapter_config.json b/checkpoint-76/adapter_config.json new file mode 100644 index 
0000000000000000000000000000000000000000..a192388a7b55129be9ad9168abc396b47bbda6f7 --- /dev/null +++ b/checkpoint-76/adapter_config.json @@ -0,0 +1,42 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 512, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [ + "embed_tokens", + "lm_head" + ], + "peft_type": "LORA", + "r": 256, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "q_proj", + "o_proj", + "down_proj", + "gate_proj", + "k_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-76/adapter_model.safetensors b/checkpoint-76/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..454894cfae5ac0aee99fe142f3b8a8f903db8169 --- /dev/null +++ b/checkpoint-76/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ddf38d8e5bdcee61a3cef0eefd8abdabaf8fd8ac8c9222a20af79b3cbbbfc9d +size 10829849744 diff --git a/checkpoint-76/global_step76/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-76/global_step76/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6c550f918b7d988cbce152bc906a2860e04c1007 --- /dev/null +++ b/checkpoint-76/global_step76/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:99a5ac92e2b51a99f803ada3c4bc01195ea5ea8a8cf5d0050be0324ae4d1da10 +size 21659418140 diff --git a/checkpoint-76/global_step76/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-76/global_step76/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a58d309f73654f0ab73383eb701aff7934e93961 --- /dev/null +++ b/checkpoint-76/global_step76/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf34473dddb473da1d933f9b4971c1db56bbb7a39faa15deff62c8ae95b9447d +size 21659457372 diff --git a/checkpoint-76/global_step76/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-76/global_step76/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6893b5ea87f9150676abc6114f09ab08ea9cb8a3 --- /dev/null +++ b/checkpoint-76/global_step76/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07dc1593969387baab70f62b8078b55eb3faa4076d593bb072d3783c775510ec +size 21659417820 diff --git a/checkpoint-76/global_step76/mp_rank_00_model_states.pt b/checkpoint-76/global_step76/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..58539fde3d7c004ff7b49a1e1c7f5bd386941eb6 --- /dev/null +++ b/checkpoint-76/global_step76/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92b083ac5695003888f71cce2fd08ef6c3da070152619e5d9f493cb6a26e7dfe +size 11918643933 diff --git a/checkpoint-76/latest b/checkpoint-76/latest new file mode 100644 index 0000000000000000000000000000000000000000..3137f19948d5aa563b9948e1161e2ee9665c4f33 --- /dev/null +++ b/checkpoint-76/latest @@ -0,0 +1 @@ +global_step76 \ No newline at end of file diff --git a/checkpoint-76/rng_state_0.pth b/checkpoint-76/rng_state_0.pth new file mode 100644 index 
0000000000000000000000000000000000000000..f0243a35107f9351684b4b491b92475dc82efd3d --- /dev/null +++ b/checkpoint-76/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4caef4c46f6d195a03b7ac9efc175ac61a61d226f3ba835e0fb9bac39e6bc64 +size 14768 diff --git a/checkpoint-76/rng_state_1.pth b/checkpoint-76/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..81018554b8d83534e404729a34be576aba370f9e --- /dev/null +++ b/checkpoint-76/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33674d0cb90389e7c73070665b3a44c29b2a2e3d5ae9dd280aeddf03fcad3db6 +size 14768 diff --git a/checkpoint-76/rng_state_2.pth b/checkpoint-76/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..cb9fddced9c2ef113e1fe83cb8d35aae87b9c46a --- /dev/null +++ b/checkpoint-76/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f86d3f237c14ca71c248f48fd103d5f3f60e3d6f92df22f71396c5e09ff918ae +size 14768 diff --git a/checkpoint-76/scheduler.pt b/checkpoint-76/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f7c7c864b80bf300100f0f6e4794b1332e6b06fe --- /dev/null +++ b/checkpoint-76/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c315fe8040ff17806bbc85e0852bb90368308daf9b2756cfe9934a4972441ebd +size 1064 diff --git a/checkpoint-76/special_tokens_map.json b/checkpoint-76/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..278b7f0f84be865c4687700ee7b3c63d89a51e18 --- /dev/null +++ b/checkpoint-76/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + 
"content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-76/tokenizer.json b/checkpoint-76/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-76/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-76/tokenizer_config.json b/checkpoint-76/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..fdde94c29816839ec3c29d6c6461206a49911f3c --- /dev/null +++ b/checkpoint-76/tokenizer_config.json @@ -0,0 +1,2064 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": 
"<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": 
"<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": 
"<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": 
"<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } 
+ }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + 
"extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|end_of_text|>", + "tokenizer_class": "PreTrainedTokenizer" +} diff --git a/checkpoint-76/trainer_state.json b/checkpoint-76/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ba4feb375077acfcdb249ee73f08bc12d489eee4 --- /dev/null +++ b/checkpoint-76/trainer_state.json @@ -0,0 +1,565 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 76, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.013157894736842105, + "grad_norm": 34.99433898925781, + "learning_rate": 5.0000000000000004e-08, + "loss": 2.595, + "step": 1 + }, + { + "epoch": 0.02631578947368421, + "grad_norm": 35.6848258972168, + "learning_rate": 1.0000000000000001e-07, + "loss": 2.6447, + "step": 2 + }, + { + "epoch": 0.039473684210526314, + "grad_norm": 35.07997512817383, + "learning_rate": 1.5000000000000002e-07, + "loss": 2.5819, + "step": 3 + }, + { + "epoch": 0.05263157894736842, + "grad_norm": 34.3863525390625, + "learning_rate": 2.0000000000000002e-07, + "loss": 2.5739, + "step": 4 + }, + { + "epoch": 0.06578947368421052, + "grad_norm": 35.443077087402344, + "learning_rate": 2.5000000000000004e-07, + "loss": 2.6071, + "step": 5 + }, + { + "epoch": 0.07894736842105263, + "grad_norm": 34.70173263549805, + "learning_rate": 3.0000000000000004e-07, + "loss": 2.5487, + "step": 6 + }, + { + "epoch": 0.09210526315789473, + "grad_norm": 34.421295166015625, + "learning_rate": 3.5000000000000004e-07, + "loss": 2.5494, + "step": 7 + }, + { + "epoch": 0.10526315789473684, + "grad_norm": 35.152748107910156, + "learning_rate": 4.0000000000000003e-07, + "loss": 2.5936, + "step": 8 + }, + { + "epoch": 0.11842105263157894, + "grad_norm": 34.947021484375, + "learning_rate": 4.5000000000000003e-07, + 
"loss": 2.5574, + "step": 9 + }, + { + "epoch": 0.13157894736842105, + "grad_norm": 34.67315673828125, + "learning_rate": 5.000000000000001e-07, + "loss": 2.4894, + "step": 10 + }, + { + "epoch": 0.14473684210526316, + "grad_norm": 34.679954528808594, + "learning_rate": 5.5e-07, + "loss": 2.4985, + "step": 11 + }, + { + "epoch": 0.15789473684210525, + "grad_norm": 33.57002258300781, + "learning_rate": 6.000000000000001e-07, + "loss": 2.4339, + "step": 12 + }, + { + "epoch": 0.17105263157894737, + "grad_norm": 33.517276763916016, + "learning_rate": 6.5e-07, + "loss": 2.4055, + "step": 13 + }, + { + "epoch": 0.18421052631578946, + "grad_norm": 33.5312385559082, + "learning_rate": 7.000000000000001e-07, + "loss": 2.3806, + "step": 14 + }, + { + "epoch": 0.19736842105263158, + "grad_norm": 32.01276779174805, + "learning_rate": 7.5e-07, + "loss": 2.2505, + "step": 15 + }, + { + "epoch": 0.21052631578947367, + "grad_norm": 31.827980041503906, + "learning_rate": 8.000000000000001e-07, + "loss": 2.1359, + "step": 16 + }, + { + "epoch": 0.2236842105263158, + "grad_norm": 31.437101364135742, + "learning_rate": 8.500000000000001e-07, + "loss": 2.1117, + "step": 17 + }, + { + "epoch": 0.23684210526315788, + "grad_norm": 30.315187454223633, + "learning_rate": 9.000000000000001e-07, + "loss": 1.9795, + "step": 18 + }, + { + "epoch": 0.25, + "grad_norm": 29.622655868530273, + "learning_rate": 9.500000000000001e-07, + "loss": 1.8472, + "step": 19 + }, + { + "epoch": 0.2631578947368421, + "grad_norm": 28.628408432006836, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.7283, + "step": 20 + }, + { + "epoch": 0.27631578947368424, + "grad_norm": 27.83180046081543, + "learning_rate": 1.0500000000000001e-06, + "loss": 1.5942, + "step": 21 + }, + { + "epoch": 0.2894736842105263, + "grad_norm": 26.911596298217773, + "learning_rate": 1.1e-06, + "loss": 1.4467, + "step": 22 + }, + { + "epoch": 0.3026315789473684, + "grad_norm": 25.88102149963379, + "learning_rate": 
1.1500000000000002e-06, + "loss": 1.3007, + "step": 23 + }, + { + "epoch": 0.3157894736842105, + "grad_norm": 25.146381378173828, + "learning_rate": 1.2000000000000002e-06, + "loss": 1.1319, + "step": 24 + }, + { + "epoch": 0.32894736842105265, + "grad_norm": 24.800382614135742, + "learning_rate": 1.25e-06, + "loss": 0.9359, + "step": 25 + }, + { + "epoch": 0.34210526315789475, + "grad_norm": 24.648332595825195, + "learning_rate": 1.3e-06, + "loss": 0.7054, + "step": 26 + }, + { + "epoch": 0.35526315789473684, + "grad_norm": 22.947620391845703, + "learning_rate": 1.3500000000000002e-06, + "loss": 0.5209, + "step": 27 + }, + { + "epoch": 0.3684210526315789, + "grad_norm": 17.80010414123535, + "learning_rate": 1.4000000000000001e-06, + "loss": 0.3546, + "step": 28 + }, + { + "epoch": 0.3815789473684211, + "grad_norm": 11.841789245605469, + "learning_rate": 1.45e-06, + "loss": 0.26, + "step": 29 + }, + { + "epoch": 0.39473684210526316, + "grad_norm": 7.275839805603027, + "learning_rate": 1.5e-06, + "loss": 0.1808, + "step": 30 + }, + { + "epoch": 0.40789473684210525, + "grad_norm": 4.6324543952941895, + "learning_rate": 1.5500000000000002e-06, + "loss": 0.1464, + "step": 31 + }, + { + "epoch": 0.42105263157894735, + "grad_norm": 3.1281485557556152, + "learning_rate": 1.6000000000000001e-06, + "loss": 0.1079, + "step": 32 + }, + { + "epoch": 0.4342105263157895, + "grad_norm": 2.062562942504883, + "learning_rate": 1.6500000000000003e-06, + "loss": 0.0966, + "step": 33 + }, + { + "epoch": 0.4473684210526316, + "grad_norm": 2.1343328952789307, + "learning_rate": 1.7000000000000002e-06, + "loss": 0.088, + "step": 34 + }, + { + "epoch": 0.4605263157894737, + "grad_norm": 1.6768524646759033, + "learning_rate": 1.75e-06, + "loss": 0.0783, + "step": 35 + }, + { + "epoch": 0.47368421052631576, + "grad_norm": 1.0879229307174683, + "learning_rate": 1.8000000000000001e-06, + "loss": 0.0623, + "step": 36 + }, + { + "epoch": 0.4868421052631579, + "grad_norm": 0.83177649974823, + 
"learning_rate": 1.85e-06, + "loss": 0.0655, + "step": 37 + }, + { + "epoch": 0.5, + "grad_norm": 0.5678385496139526, + "learning_rate": 1.9000000000000002e-06, + "loss": 0.0565, + "step": 38 + }, + { + "epoch": 0.5131578947368421, + "grad_norm": 0.6994458436965942, + "learning_rate": 1.9500000000000004e-06, + "loss": 0.0491, + "step": 39 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 0.711387038230896, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.0507, + "step": 40 + }, + { + "epoch": 0.5394736842105263, + "grad_norm": 0.7169735431671143, + "learning_rate": 2.05e-06, + "loss": 0.0478, + "step": 41 + }, + { + "epoch": 0.5526315789473685, + "grad_norm": 0.603631317615509, + "learning_rate": 2.1000000000000002e-06, + "loss": 0.0507, + "step": 42 + }, + { + "epoch": 0.5657894736842105, + "grad_norm": 0.617487907409668, + "learning_rate": 2.15e-06, + "loss": 0.043, + "step": 43 + }, + { + "epoch": 0.5789473684210527, + "grad_norm": 0.4638065993785858, + "learning_rate": 2.2e-06, + "loss": 0.0472, + "step": 44 + }, + { + "epoch": 0.5921052631578947, + "grad_norm": 0.5996385216712952, + "learning_rate": 2.25e-06, + "loss": 0.0429, + "step": 45 + }, + { + "epoch": 0.6052631578947368, + "grad_norm": 0.39118286967277527, + "learning_rate": 2.3000000000000004e-06, + "loss": 0.0421, + "step": 46 + }, + { + "epoch": 0.618421052631579, + "grad_norm": 0.3118075728416443, + "learning_rate": 2.35e-06, + "loss": 0.0383, + "step": 47 + }, + { + "epoch": 0.631578947368421, + "grad_norm": 0.31731992959976196, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.041, + "step": 48 + }, + { + "epoch": 0.6447368421052632, + "grad_norm": 0.5413194298744202, + "learning_rate": 2.4500000000000003e-06, + "loss": 0.0397, + "step": 49 + }, + { + "epoch": 0.6578947368421053, + "grad_norm": 0.32958006858825684, + "learning_rate": 2.5e-06, + "loss": 0.0355, + "step": 50 + }, + { + "epoch": 0.6710526315789473, + "grad_norm": 0.596309244632721, + "learning_rate": 2.55e-06, + 
"loss": 0.0413, + "step": 51 + }, + { + "epoch": 0.6842105263157895, + "grad_norm": 0.4557362496852875, + "learning_rate": 2.6e-06, + "loss": 0.0461, + "step": 52 + }, + { + "epoch": 0.6973684210526315, + "grad_norm": 0.3345410227775574, + "learning_rate": 2.6500000000000005e-06, + "loss": 0.0385, + "step": 53 + }, + { + "epoch": 0.7105263157894737, + "grad_norm": 0.3047848343849182, + "learning_rate": 2.7000000000000004e-06, + "loss": 0.0383, + "step": 54 + }, + { + "epoch": 0.7236842105263158, + "grad_norm": 0.43763449788093567, + "learning_rate": 2.7500000000000004e-06, + "loss": 0.038, + "step": 55 + }, + { + "epoch": 0.7368421052631579, + "grad_norm": 0.26870036125183105, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.0374, + "step": 56 + }, + { + "epoch": 0.75, + "grad_norm": 0.38762542605400085, + "learning_rate": 2.85e-06, + "loss": 0.0349, + "step": 57 + }, + { + "epoch": 0.7631578947368421, + "grad_norm": 0.27517396211624146, + "learning_rate": 2.9e-06, + "loss": 0.0398, + "step": 58 + }, + { + "epoch": 0.7763157894736842, + "grad_norm": 0.30815261602401733, + "learning_rate": 2.95e-06, + "loss": 0.0364, + "step": 59 + }, + { + "epoch": 0.7894736842105263, + "grad_norm": 0.30011361837387085, + "learning_rate": 3e-06, + "loss": 0.0307, + "step": 60 + }, + { + "epoch": 0.8026315789473685, + "grad_norm": 0.3269154727458954, + "learning_rate": 3.05e-06, + "loss": 0.0344, + "step": 61 + }, + { + "epoch": 0.8157894736842105, + "grad_norm": 0.3750869333744049, + "learning_rate": 3.1000000000000004e-06, + "loss": 0.0339, + "step": 62 + }, + { + "epoch": 0.8289473684210527, + "grad_norm": 0.29285815358161926, + "learning_rate": 3.1500000000000003e-06, + "loss": 0.034, + "step": 63 + }, + { + "epoch": 0.8421052631578947, + "grad_norm": 0.4157550632953644, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.0348, + "step": 64 + }, + { + "epoch": 0.8552631578947368, + "grad_norm": 0.2852867543697357, + "learning_rate": 3.2500000000000002e-06, + "loss": 
0.0319, + "step": 65 + }, + { + "epoch": 0.868421052631579, + "grad_norm": 0.4384031593799591, + "learning_rate": 3.3000000000000006e-06, + "loss": 0.0319, + "step": 66 + }, + { + "epoch": 0.881578947368421, + "grad_norm": 0.4003254771232605, + "learning_rate": 3.3500000000000005e-06, + "loss": 0.0347, + "step": 67 + }, + { + "epoch": 0.8947368421052632, + "grad_norm": 0.49913832545280457, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.0347, + "step": 68 + }, + { + "epoch": 0.9078947368421053, + "grad_norm": 0.22642269730567932, + "learning_rate": 3.45e-06, + "loss": 0.0306, + "step": 69 + }, + { + "epoch": 0.9210526315789473, + "grad_norm": 0.34004101157188416, + "learning_rate": 3.5e-06, + "loss": 0.0337, + "step": 70 + }, + { + "epoch": 0.9342105263157895, + "grad_norm": 0.21503636240959167, + "learning_rate": 3.5500000000000003e-06, + "loss": 0.0311, + "step": 71 + }, + { + "epoch": 0.9473684210526315, + "grad_norm": 0.33802086114883423, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.0293, + "step": 72 + }, + { + "epoch": 0.9605263157894737, + "grad_norm": 0.2488064169883728, + "learning_rate": 3.65e-06, + "loss": 0.0318, + "step": 73 + }, + { + "epoch": 0.9736842105263158, + "grad_norm": 0.21124528348445892, + "learning_rate": 3.7e-06, + "loss": 0.0293, + "step": 74 + }, + { + "epoch": 0.9868421052631579, + "grad_norm": 0.3108712136745453, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.0288, + "step": 75 + }, + { + "epoch": 1.0, + "grad_norm": 0.33483418822288513, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.031, + "step": 76 + } + ], + "logging_steps": 1, + "max_steps": 456, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 76, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.889219280312205e+18, + 
"train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-76/training_args.bin b/checkpoint-76/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..5a1edbdcc63a93daa09112168cf20c0f8fcb7512 --- /dev/null +++ b/checkpoint-76/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:041cfaa5bf8383821dea4fa5a9d2eab2caad4644c4cd651398c8b0ab1541b270 +size 7992 diff --git a/checkpoint-76/zero_to_fp32.py b/checkpoint-76/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/checkpoint-76/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in 
state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to 
the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel 
{unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + 
print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + 
partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the 
partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + 
state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. 
you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. 
Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/config.json b/config.json new file mode 100644 index 0000000000000000000000000000000000000000..dd4e5ad4ad4c9bafb071dccdbf016fd2b2c567d4 --- /dev/null +++ b/config.json @@ -0,0 +1,52 @@ +{ + "_attn_implementation_autoset": true, + "_name_or_path": "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": 128009, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 8192, + "initializer_range": 0.02, + "intermediate_size": 28672, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 64, + "num_hidden_layers": 80, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "quantization_config": { + "_load_in_4bit": true, + "_load_in_8bit": false, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_storage": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "llm_int8_enable_fp32_cpu_offload": false, + "llm_int8_has_fp16_weight": false, + "llm_int8_skip_modules": null, + "llm_int8_threshold": 6.0, + "load_in_4bit": true, + "load_in_8bit": false, + "quant_method": "bitsandbytes" + }, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.49.0", + "use_cache": false, + 
"vocab_size": 128256 +} diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..278b7f0f84be865c4687700ee7b3c63d89a51e18 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..fdde94c29816839ec3c29d6c6461206a49911f3c --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,2064 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": 
"<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": 
"<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": 
"<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + 
"extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|end_of_text|>", + "tokenizer_class": "PreTrainedTokenizer" +}