diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..721f2d0c53abfbaf4d56cbb67c73669aa495bc3c 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +checkpoint-152/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-228/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-304/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-380/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-76/tokenizer.json filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/adapter_config.json b/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d045d0a3733119f500edd4a685eb2f8e1da115e9 --- /dev/null +++ b/adapter_config.json @@ -0,0 +1,42 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.3-70B-Instruct", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": null, + "inference_mode": false, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 512, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [ + "embed_tokens", + "lm_head" + ], + "peft_type": "LORA", + "r": 256, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "gate_proj", + "up_proj", + "k_proj", + "q_proj", + "down_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-152/README.md b/checkpoint-152/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1b184114a0c28ed3e4c082c18486736dc818166d --- /dev/null +++ b/checkpoint-152/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Llama-3.3-70B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.0 \ No newline at end of file diff --git a/checkpoint-152/adapter_config.json b/checkpoint-152/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..763d98264e556bcbc60c63c5b9f70b53c7bbe722 --- /dev/null +++ b/checkpoint-152/adapter_config.json @@ -0,0 +1,42 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.3-70B-Instruct", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 512, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [ + "embed_tokens", + "lm_head" + ], + "peft_type": "LORA", + "r": 256, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "gate_proj", + "up_proj", + "k_proj", + "q_proj", + "down_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-152/adapter_model.safetensors b/checkpoint-152/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6015b4b8cbf8efa5e33e64932b8f5b0cc51373c5 --- /dev/null +++ b/checkpoint-152/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ee6c04ff0628403245a0aff20f9ace524305eb5ae003589232461e48959ae2f +size 10829849744 diff --git a/checkpoint-152/global_step152/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-152/global_step152/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b69e7195c5a9e9fe00ba507350f0c168077e66fe --- /dev/null +++ b/checkpoint-152/global_step152/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eee7d746d5ff200ceb3b92c507700288562d5a3f3d394377919cd4870e1f2538 +size 21659418140 diff --git a/checkpoint-152/global_step152/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-152/global_step152/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..04e82c155f8cc86d0048a8d45dad2eae70d9ff0e --- /dev/null +++ b/checkpoint-152/global_step152/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1dc0b82d6e9ef771b223c4afd3c115bd266a4bb305de31db8bb63df19de903ad +size 21659457372 diff --git a/checkpoint-152/global_step152/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-152/global_step152/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b6f92e1de796ef9159ac6c0bcab75b1bea302138 --- /dev/null +++ b/checkpoint-152/global_step152/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfaf15afc8da44353267f2f92672fe0b66c6915733444653d246f89eae4f4860 +size 21659417820 diff --git a/checkpoint-152/global_step152/mp_rank_00_model_states.pt b/checkpoint-152/global_step152/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5585f0619e464bf25de42da08d43657b2be4fe69 --- /dev/null +++ b/checkpoint-152/global_step152/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0bb420a06dd2df3159fea7cb337a050c9c23a376b527699b0432632462d24823 +size 11918643933 diff --git a/checkpoint-152/latest b/checkpoint-152/latest new file mode 100644 index 0000000000000000000000000000000000000000..60406aecd15beeaa730a071c614fe2ab5b4c734b --- /dev/null +++ b/checkpoint-152/latest @@ -0,0 +1 @@ +global_step152 \ No newline at end of file diff --git a/checkpoint-152/rng_state_0.pth b/checkpoint-152/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..1273774ecb9e0ba6283be0c2e8531a122e231d68 --- /dev/null +++ b/checkpoint-152/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd406425ef68395a3dcb05f97990b4ddc0a85ccc26e2550b978b0f0905f63fca +size 14768 diff --git a/checkpoint-152/rng_state_1.pth b/checkpoint-152/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..7e626e64f550afe32c3368c8e040cd7056a74bb6 --- /dev/null +++ b/checkpoint-152/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:483a6993ec4e0cbec89e300d2a3bbeaf7fff23e01afc2457568a12aad958f9ac +size 14768 diff --git a/checkpoint-152/rng_state_2.pth b/checkpoint-152/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..1019e5921f219a14d8e09734eb68025ace867a77 --- /dev/null +++ b/checkpoint-152/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38f8203e92e896c8414d617110a8f97cc8e8be34d1aec495713321cbbe176d78 +size 14768 diff --git a/checkpoint-152/scheduler.pt b/checkpoint-152/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..92432b11da80f35807484df38118e1ccd8d23aaa --- /dev/null +++ b/checkpoint-152/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6fa27436a75b1cdfdf2a48cad061d7983f71c2e5ca468127002dad296770375e +size 1064 diff --git a/checkpoint-152/special_tokens_map.json b/checkpoint-152/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..278b7f0f84be865c4687700ee7b3c63d89a51e18 --- /dev/null +++ b/checkpoint-152/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-152/tokenizer.json b/checkpoint-152/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-152/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-152/tokenizer_config.json b/checkpoint-152/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ca91a2ef55f4239a7af81d7c9abb05f53621a07b --- /dev/null +++ b/checkpoint-152/tokenizer_config.json @@ -0,0 +1,2064 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|end_of_text|>", + "tokenizer_class": "PreTrainedTokenizer" +} diff --git a/checkpoint-152/trainer_state.json b/checkpoint-152/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..3919906ab6e95336c5d159bd15945535de39a994 --- /dev/null +++ b/checkpoint-152/trainer_state.json @@ -0,0 +1,1097 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 500, + "global_step": 152, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.013157894736842105, + "grad_norm": 37.79440689086914, + "learning_rate": 5.0000000000000004e-08, + "loss": 3.1402, + "step": 1 + }, + { + "epoch": 0.02631578947368421, + "grad_norm": 38.45823287963867, + "learning_rate": 1.0000000000000001e-07, + "loss": 3.1787, + "step": 2 + }, + { + "epoch": 0.039473684210526314, + "grad_norm": 38.25625228881836, + "learning_rate": 1.5000000000000002e-07, + "loss": 3.1316, + "step": 3 + }, + { + "epoch": 0.05263157894736842, + "grad_norm": 37.2024040222168, + "learning_rate": 2.0000000000000002e-07, + "loss": 3.1011, + "step": 4 + }, + { + "epoch": 0.06578947368421052, + "grad_norm": 38.17294692993164, + "learning_rate": 2.5000000000000004e-07, + "loss": 3.133, + "step": 5 + }, + { + "epoch": 0.07894736842105263, + "grad_norm": 37.374794006347656, + "learning_rate": 3.0000000000000004e-07, + "loss": 3.0731, + "step": 6 + }, + { + "epoch": 0.09210526315789473, + "grad_norm": 37.226966857910156, + "learning_rate": 3.5000000000000004e-07, + "loss": 3.069, + "step": 7 + }, + { + "epoch": 0.10526315789473684, + "grad_norm": 38.40094757080078, + "learning_rate": 4.0000000000000003e-07, + "loss": 3.1223, + "step": 8 + }, + { + "epoch": 0.11842105263157894, + "grad_norm": 37.86320877075195, + "learning_rate": 4.5000000000000003e-07, + "loss": 3.062, + "step": 9 + }, + { + "epoch": 0.13157894736842105, + "grad_norm": 38.02171325683594, + "learning_rate": 5.000000000000001e-07, + "loss": 3.0008, + "step": 10 + }, + { + "epoch": 0.14473684210526316, + "grad_norm": 38.5522346496582, + "learning_rate": 5.5e-07, + "loss": 3.0047, + "step": 11 + }, + { + "epoch": 0.15789473684210525, + "grad_norm": 37.72829818725586, + "learning_rate": 6.000000000000001e-07, + "loss": 2.9274, + "step": 12 + }, + { + "epoch": 0.17105263157894737, + "grad_norm": 38.488494873046875, + "learning_rate": 6.5e-07, + "loss": 2.8727, + "step": 13 + }, + { + "epoch": 0.18421052631578946, + "grad_norm": 38.87471389770508, + "learning_rate": 7.000000000000001e-07, + "loss": 2.8422, + "step": 14 + }, + { + "epoch": 0.19736842105263158, + "grad_norm": 37.584896087646484, + "learning_rate": 7.5e-07, + "loss": 2.6728, + "step": 15 + }, + { + "epoch": 0.21052631578947367, + "grad_norm": 37.04607391357422, + "learning_rate": 8.000000000000001e-07, + "loss": 2.5215, + "step": 16 + }, + { + "epoch": 0.2236842105263158, + "grad_norm": 37.30121994018555, + "learning_rate": 8.500000000000001e-07, + "loss": 2.4689, + "step": 17 + }, + { + "epoch": 0.23684210526315788, + "grad_norm": 35.99961853027344, + "learning_rate": 9.000000000000001e-07, + "loss": 2.3, + "step": 18 + }, + { + "epoch": 0.25, + "grad_norm": 35.817543029785156, + "learning_rate": 9.500000000000001e-07, + "loss": 2.1423, + "step": 19 + }, + { + "epoch": 0.2631578947368421, + "grad_norm": 35.056915283203125, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.9639, + "step": 20 + }, + { + "epoch": 0.27631578947368424, + "grad_norm": 34.83850860595703, + "learning_rate": 1.0500000000000001e-06, + "loss": 1.7845, + "step": 21 + }, + { + "epoch": 0.2894736842105263, + "grad_norm": 34.32366943359375, + "learning_rate": 1.1e-06, + "loss": 1.5864, + "step": 22 + }, + { + "epoch": 0.3026315789473684, + "grad_norm": 33.79611587524414, + "learning_rate": 1.1500000000000002e-06, + "loss": 1.4011, + "step": 23 + }, + { + "epoch": 0.3157894736842105, + "grad_norm": 32.596031188964844, + "learning_rate": 1.2000000000000002e-06, + "loss": 1.195, + "step": 24 + }, + { + "epoch": 0.32894736842105265, + "grad_norm": 30.045007705688477, + "learning_rate": 1.25e-06, + "loss": 0.9883, + "step": 25 + }, + { + "epoch": 0.34210526315789475, + "grad_norm": 24.89093589782715, + "learning_rate": 1.3e-06, + "loss": 0.7669, + "step": 26 + }, + { + "epoch": 0.35526315789473684, + "grad_norm": 23.454408645629883, + "learning_rate": 1.3500000000000002e-06, + "loss": 0.6304, + "step": 27 + }, + { + "epoch": 0.3684210526315789, + "grad_norm": 19.837312698364258, + "learning_rate": 1.4000000000000001e-06, + "loss": 0.4717, + "step": 28 + }, + { + "epoch": 0.3815789473684211, + "grad_norm": 15.185093879699707, + "learning_rate": 1.45e-06, + "loss": 0.363, + "step": 29 + }, + { + "epoch": 0.39473684210526316, + "grad_norm": 9.057796478271484, + "learning_rate": 1.5e-06, + "loss": 0.2439, + "step": 30 + }, + { + "epoch": 0.40789473684210525, + "grad_norm": 5.976982593536377, + "learning_rate": 1.5500000000000002e-06, + "loss": 0.1864, + "step": 31 + }, + { + "epoch": 0.42105263157894735, + "grad_norm": 3.067375421524048, + "learning_rate": 1.6000000000000001e-06, + "loss": 0.1134, + "step": 32 + }, + { + "epoch": 0.4342105263157895, + "grad_norm": 2.3589119911193848, + "learning_rate": 1.6500000000000003e-06, + "loss": 0.0985, + "step": 33 + }, + { + "epoch": 0.4473684210526316, + "grad_norm": 2.0044353008270264, + "learning_rate": 1.7000000000000002e-06, + "loss": 0.0859, + "step": 34 + }, + { + "epoch": 0.4605263157894737, + "grad_norm": 1.4279972314834595, + "learning_rate": 1.75e-06, + "loss": 0.0728, + "step": 35 + }, + { + "epoch": 0.47368421052631576, + "grad_norm": 0.9807674288749695, + "learning_rate": 1.8000000000000001e-06, + "loss": 0.061, + "step": 36 + }, + { + "epoch": 0.4868421052631579, + "grad_norm": 0.906160295009613, + "learning_rate": 1.85e-06, + "loss": 0.0676, + "step": 37 + }, + { + "epoch": 0.5, + "grad_norm": 0.8837690353393555, + "learning_rate": 1.9000000000000002e-06, + "loss": 0.0622, + "step": 38 + }, + { + "epoch": 0.5131578947368421, + "grad_norm": 0.9579435586929321, + "learning_rate": 1.9500000000000004e-06, + "loss": 0.0557, + "step": 39 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 0.8149510622024536, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.0555, + "step": 40 + }, + { + "epoch": 0.5394736842105263, + "grad_norm": 0.8899760246276855, + "learning_rate": 2.05e-06, + "loss": 0.0517, + "step": 41 + }, + { + "epoch": 0.5526315789473685, + "grad_norm": 0.6007645130157471, + "learning_rate": 2.1000000000000002e-06, + "loss": 0.0518, + "step": 42 + }, + { + "epoch": 0.5657894736842105, + "grad_norm": 0.48819127678871155, + "learning_rate": 2.15e-06, + "loss": 0.0429, + "step": 43 + }, + { + "epoch": 0.5789473684210527, + "grad_norm": 0.42939358949661255, + "learning_rate": 2.2e-06, + "loss": 0.0459, + "step": 44 + }, + { + "epoch": 0.5921052631578947, + "grad_norm": 0.5706579685211182, + "learning_rate": 2.25e-06, + "loss": 0.0453, + "step": 45 + }, + { + "epoch": 0.6052631578947368, + "grad_norm": 0.3034597337245941, + "learning_rate": 2.3000000000000004e-06, + "loss": 0.0421, + "step": 46 + }, + { + "epoch": 0.618421052631579, + "grad_norm": 0.5601783394813538, + "learning_rate": 2.35e-06, + "loss": 0.0411, + "step": 47 + }, + { + "epoch": 0.631578947368421, + "grad_norm": 0.35388317704200745, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.04, + "step": 48 + }, + { + "epoch": 0.6447368421052632, + "grad_norm": 0.48609891533851624, + "learning_rate": 2.4500000000000003e-06, + "loss": 0.04, + "step": 49 + }, + { + "epoch": 0.6578947368421053, + "grad_norm": 0.4638507068157196, + "learning_rate": 2.5e-06, + "loss": 0.0369, + "step": 50 + }, + { + "epoch": 0.6710526315789473, + "grad_norm": 0.5685771703720093, + "learning_rate": 2.55e-06, + "loss": 0.0428, + "step": 51 + }, + { + "epoch": 0.6842105263157895, + "grad_norm": 0.46358174085617065, + "learning_rate": 2.6e-06, + "loss": 0.0483, + "step": 52 + }, + { + "epoch": 0.6973684210526315, + "grad_norm": 0.35054436326026917, + "learning_rate": 2.6500000000000005e-06, + "loss": 0.0391, + "step": 53 + }, + { + "epoch": 0.7105263157894737, + "grad_norm": 0.3350559175014496, + "learning_rate": 2.7000000000000004e-06, + "loss": 0.039, + "step": 54 + }, + { + "epoch": 0.7236842105263158, + "grad_norm": 0.2875112295150757, + "learning_rate": 2.7500000000000004e-06, + "loss": 0.0383, + "step": 55 + }, + { + "epoch": 0.7368421052631579, + "grad_norm": 0.4492928683757782, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.0358, + "step": 56 + }, + { + "epoch": 0.75, + "grad_norm": 0.29484888911247253, + "learning_rate": 2.85e-06, + "loss": 0.0355, + "step": 57 + }, + { + "epoch": 0.7631578947368421, + "grad_norm": 0.36551928520202637, + "learning_rate": 2.9e-06, + "loss": 0.0403, + "step": 58 + }, + { + "epoch": 0.7763157894736842, + "grad_norm": 0.4458053708076477, + "learning_rate": 2.95e-06, + "loss": 0.0342, + "step": 59 + }, + { + "epoch": 0.7894736842105263, + "grad_norm": 0.34047460556030273, + "learning_rate": 3e-06, + "loss": 0.0302, + "step": 60 + }, + { + "epoch": 0.8026315789473685, + "grad_norm": 0.3420606255531311, + "learning_rate": 3.05e-06, + "loss": 0.034, + "step": 61 + }, + { + "epoch": 0.8157894736842105, + "grad_norm": 0.3902851939201355, + "learning_rate": 3.1000000000000004e-06, + "loss": 0.0327, + "step": 62 + }, + { + "epoch": 0.8289473684210527, + "grad_norm": 0.29165828227996826, + "learning_rate": 3.1500000000000003e-06, + "loss": 0.0341, + "step": 63 + }, + { + "epoch": 0.8421052631578947, + "grad_norm": 0.40872958302497864, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.035, + "step": 64 + }, + { + "epoch": 0.8552631578947368, + "grad_norm": 0.36295783519744873, + "learning_rate": 3.2500000000000002e-06, + "loss": 0.0323, + "step": 65 + }, + { + "epoch": 0.868421052631579, + "grad_norm": 0.3857724368572235, + "learning_rate": 3.3000000000000006e-06, + "loss": 0.0336, + "step": 66 + }, + { + "epoch": 0.881578947368421, + "grad_norm": 0.3207017481327057, + "learning_rate": 3.3500000000000005e-06, + "loss": 0.0332, + "step": 67 + }, + { + "epoch": 0.8947368421052632, + "grad_norm": 0.2903987169265747, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.0327, + "step": 68 + }, + { + "epoch": 0.9078947368421053, + "grad_norm": 0.3386954963207245, + "learning_rate": 3.45e-06, + "loss": 0.0308, + "step": 69 + }, + { + "epoch": 0.9210526315789473, + "grad_norm": 0.4339621365070343, + "learning_rate": 3.5e-06, + "loss": 0.0361, + "step": 70 + }, + { + "epoch": 0.9342105263157895, + "grad_norm": 0.28095564246177673, + "learning_rate": 3.5500000000000003e-06, + "loss": 0.0306, + "step": 71 + }, + { + "epoch": 0.9473684210526315, + "grad_norm": 0.4141469895839691, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.028, + "step": 72 + }, + { + "epoch": 0.9605263157894737, + "grad_norm": 0.35212820768356323, + "learning_rate": 3.65e-06, + "loss": 0.032, + "step": 73 + }, + { + "epoch": 0.9736842105263158, + "grad_norm": 0.26956063508987427, + "learning_rate": 3.7e-06, + "loss": 0.0294, + "step": 74 + }, + { + "epoch": 0.9868421052631579, + "grad_norm": 0.32735681533813477, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.0272, + "step": 75 + }, + { + "epoch": 1.0, + "grad_norm": 0.4906782805919647, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.0324, + "step": 76 + }, + { + "epoch": 1.013157894736842, + "grad_norm": 0.3451901078224182, + "learning_rate": 3.85e-06, + "loss": 0.0288, + "step": 77 + }, + { + "epoch": 1.0263157894736843, + "grad_norm": 0.30598726868629456, + "learning_rate": 3.900000000000001e-06, + "loss": 0.0305, + "step": 78 + }, + { + "epoch": 1.0394736842105263, + "grad_norm": 0.31189921498298645, + "learning_rate": 3.95e-06, + "loss": 0.0274, + "step": 79 + }, + { + "epoch": 1.0526315789473684, + "grad_norm": 0.31895947456359863, + "learning_rate": 4.000000000000001e-06, + "loss": 0.0236, + "step": 80 + }, + { + "epoch": 1.0657894736842106, + "grad_norm": 0.3290308117866516, + "learning_rate": 4.05e-06, + "loss": 0.0284, + "step": 81 + }, + { + "epoch": 1.0789473684210527, + "grad_norm": 0.3651576638221741, + "learning_rate": 4.1e-06, + "loss": 0.0274, + "step": 82 + }, + { + "epoch": 1.0921052631578947, + "grad_norm": 0.2393084615468979, + "learning_rate": 4.15e-06, + "loss": 0.0301, + "step": 83 + }, + { + "epoch": 1.1052631578947367, + "grad_norm": 0.333898663520813, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.0235, + "step": 84 + }, + { + "epoch": 1.118421052631579, + "grad_norm": 0.3287582993507385, + "learning_rate": 4.25e-06, + "loss": 0.0248, + "step": 85 + }, + { + "epoch": 1.131578947368421, + "grad_norm": 0.3432455360889435, + "learning_rate": 4.3e-06, + "loss": 0.026, + "step": 86 + }, + { + "epoch": 1.1447368421052633, + "grad_norm": 0.3176783621311188, + "learning_rate": 4.350000000000001e-06, + "loss": 0.0249, + "step": 87 + }, + { + "epoch": 1.1578947368421053, + "grad_norm": 0.33373433351516724, + "learning_rate": 4.4e-06, + "loss": 0.0251, + "step": 88 + }, + { + "epoch": 1.1710526315789473, + "grad_norm": 0.36087968945503235, + "learning_rate": 4.450000000000001e-06, + "loss": 0.0251, + "step": 89 + }, + { + "epoch": 1.1842105263157894, + "grad_norm": 0.3681696057319641, + "learning_rate": 4.5e-06, + "loss": 0.0276, + "step": 90 + }, + { + "epoch": 1.1973684210526316, + "grad_norm": 0.46539774537086487, + "learning_rate": 4.5500000000000005e-06, + "loss": 0.0229, + "step": 91 + }, + { + "epoch": 1.2105263157894737, + "grad_norm": 0.23368288576602936, + "learning_rate": 4.600000000000001e-06, + "loss": 0.021, + "step": 92 + }, + { + "epoch": 1.2236842105263157, + "grad_norm": 0.26623716950416565, + "learning_rate": 4.65e-06, + "loss": 0.0265, + "step": 93 + }, + { + "epoch": 1.236842105263158, + "grad_norm": 0.28750717639923096, + "learning_rate": 4.7e-06, + "loss": 0.0221, + "step": 94 + }, + { + "epoch": 1.25, + "grad_norm": 0.46578383445739746, + "learning_rate": 4.75e-06, + "loss": 0.0236, + "step": 95 + }, + { + "epoch": 1.263157894736842, + "grad_norm": 0.33406543731689453, + "learning_rate": 4.800000000000001e-06, + "loss": 0.0239, + "step": 96 + }, + { + "epoch": 1.2763157894736843, + "grad_norm": 0.21247217059135437, + "learning_rate": 4.85e-06, + "loss": 0.0188, + "step": 97 + }, + { + "epoch": 1.2894736842105263, + "grad_norm": 0.26229164004325867, + "learning_rate": 4.9000000000000005e-06, + "loss": 0.022, + "step": 98 + }, + { + "epoch": 1.3026315789473684, + "grad_norm": 0.2967258393764496, + "learning_rate": 4.95e-06, + "loss": 0.0218, + "step": 99 + }, + { + "epoch": 1.3157894736842106, + "grad_norm": 0.419189453125, + "learning_rate": 5e-06, + "loss": 0.0247, + "step": 100 + }, + { + "epoch": 1.3289473684210527, + "grad_norm": 0.25418952107429504, + "learning_rate": 4.999902656502973e-06, + "loss": 0.0223, + "step": 101 + }, + { + "epoch": 1.3421052631578947, + "grad_norm": 0.20174147188663483, + "learning_rate": 4.9996106335924965e-06, + "loss": 0.0266, + "step": 102 + }, + { + "epoch": 1.3552631578947367, + "grad_norm": 0.21732494235038757, + "learning_rate": 4.999123954009797e-06, + "loss": 0.0188, + "step": 103 + }, + { + "epoch": 1.368421052631579, + "grad_norm": 0.2683119773864746, + "learning_rate": 4.998442655654946e-06, + "loss": 0.0203, + "step": 104 + }, + { + "epoch": 1.381578947368421, + "grad_norm": 0.18175765872001648, + "learning_rate": 4.997566791583916e-06, + "loss": 0.0185, + "step": 105 + }, + { + "epoch": 1.3947368421052633, + "grad_norm": 0.3932501971721649, + "learning_rate": 4.996496430004446e-06, + "loss": 0.0238, + "step": 106 + }, + { + "epoch": 1.4078947368421053, + "grad_norm": 0.31145599484443665, + "learning_rate": 4.995231654270726e-06, + "loss": 0.0199, + "step": 107 + }, + { + "epoch": 1.4210526315789473, + "grad_norm": 0.41356661915779114, + "learning_rate": 4.993772562876909e-06, + "loss": 0.0187, + "step": 108 + }, + { + "epoch": 1.4342105263157894, + "grad_norm": 0.22484919428825378, + "learning_rate": 4.992119269449445e-06, + "loss": 0.0182, + "step": 109 + }, + { + "epoch": 1.4473684210526316, + "grad_norm": 0.28703081607818604, + "learning_rate": 4.990271902738223e-06, + "loss": 0.0239, + "step": 110 + }, + { + "epoch": 1.4605263157894737, + "grad_norm": 0.2394670695066452, + "learning_rate": 4.988230606606552e-06, + "loss": 0.0171, + "step": 111 + }, + { + "epoch": 1.4736842105263157, + "grad_norm": 0.3552885949611664, + "learning_rate": 4.985995540019956e-06, + "loss": 0.0226, + "step": 112 + }, + { + "epoch": 1.486842105263158, + "grad_norm": 0.24968908727169037, + "learning_rate": 4.983566877033791e-06, + "loss": 0.0193, + "step": 113 + }, + { + "epoch": 1.5, + "grad_norm": 0.24420695006847382, + "learning_rate": 4.980944806779698e-06, + "loss": 0.0226, + "step": 114 + }, + { + "epoch": 1.513157894736842, + "grad_norm": 0.34696799516677856, + "learning_rate": 4.9781295334508664e-06, + "loss": 0.02, + "step": 115 + }, + { + "epoch": 1.526315789473684, + "grad_norm": 0.23682132363319397, + "learning_rate": 4.975121276286136e-06, + "loss": 0.0194, + "step": 116 + }, + { + "epoch": 1.5394736842105263, + "grad_norm": 0.2485751509666443, + "learning_rate": 4.9719202695529265e-06, + "loss": 0.0149, + "step": 117 + }, + { + "epoch": 1.5526315789473686, + "grad_norm": 0.2815033495426178, + "learning_rate": 4.968526762528988e-06, + "loss": 0.0153, + "step": 118 + }, + { + "epoch": 1.5657894736842106, + "grad_norm": 0.24127744138240814, + "learning_rate": 4.964941019482995e-06, + "loss": 0.019, + "step": 119 + }, + { + "epoch": 1.5789473684210527, + "grad_norm": 0.2987695038318634, + "learning_rate": 4.961163319653959e-06, + "loss": 0.0165, + "step": 120 + }, + { + "epoch": 1.5921052631578947, + "grad_norm": 0.33492133021354675, + "learning_rate": 4.9571939572294914e-06, + "loss": 0.0185, + "step": 121 + }, + { + "epoch": 1.6052631578947367, + "grad_norm": 0.20466521382331848, + "learning_rate": 4.953033241322887e-06, + "loss": 0.0151, + "step": 122 + }, + { + "epoch": 1.618421052631579, + "grad_norm": 0.36396247148513794, + "learning_rate": 4.948681495949055e-06, + "loss": 0.0138, + "step": 123 + }, + { + "epoch": 1.631578947368421, + "grad_norm": 0.2000381350517273, + "learning_rate": 4.944139059999286e-06, + "loss": 0.0125, + "step": 124 + }, + { + "epoch": 1.6447368421052633, + "grad_norm": 0.24977952241897583, + "learning_rate": 4.939406287214861e-06, + "loss": 0.0152, + "step": 125 + }, + { + "epoch": 1.6578947368421053, + "grad_norm": 0.26705336570739746, + "learning_rate": 4.9344835461595016e-06, + "loss": 0.0148, + "step": 126 + }, + { + "epoch": 1.6710526315789473, + "grad_norm": 0.26699599623680115, + "learning_rate": 4.929371220190671e-06, + "loss": 0.0151, + "step": 127 + }, + { + "epoch": 1.6842105263157894, + "grad_norm": 0.20149633288383484, + "learning_rate": 4.9240697074297205e-06, + "loss": 0.0151, + "step": 128 + }, + { + "epoch": 1.6973684210526314, + "grad_norm": 0.1961003988981247, + "learning_rate": 4.918579420730884e-06, + "loss": 0.0163, + "step": 129 + }, + { + "epoch": 1.7105263157894737, + "grad_norm": 0.2148503214120865, + "learning_rate": 4.912900787649124e-06, + "loss": 0.0137, + "step": 130 + }, + { + "epoch": 1.723684210526316, + "grad_norm": 0.20505128800868988, + "learning_rate": 4.907034250406846e-06, + "loss": 0.0136, + "step": 131 + }, + { + "epoch": 1.736842105263158, + "grad_norm": 0.19462467730045319, + "learning_rate": 4.900980265859449e-06, + "loss": 0.0139, + "step": 132 + }, + { + "epoch": 1.75, + "grad_norm": 0.21602794528007507, + "learning_rate": 4.894739305459754e-06, + "loss": 0.015, + "step": 133 + }, + { + "epoch": 1.763157894736842, + "grad_norm": 0.22933153808116913, + "learning_rate": 4.88831185522129e-06, + "loss": 0.0142, + "step": 134 + }, + { + "epoch": 1.776315789473684, + "grad_norm": 0.1785646229982376, + "learning_rate": 4.881698415680442e-06, + "loss": 0.0097, + "step": 135 + }, + { + "epoch": 1.7894736842105263, + "grad_norm": 0.21535581350326538, + "learning_rate": 4.874899501857477e-06, + "loss": 0.0106, + "step": 136 + }, + { + "epoch": 1.8026315789473686, + "grad_norm": 0.2360723614692688, + "learning_rate": 4.867915643216434e-06, + "loss": 0.0123, + "step": 137 + }, + { + "epoch": 1.8157894736842106, + "grad_norm": 0.18098825216293335, + "learning_rate": 4.860747383623889e-06, + "loss": 0.0126, + "step": 138 + }, + { + "epoch": 1.8289473684210527, + "grad_norm": 0.1836131066083908, + "learning_rate": 4.85339528130661e-06, + "loss": 0.0125, + "step": 139 + }, + { + "epoch": 1.8421052631578947, + "grad_norm": 0.34765973687171936, + "learning_rate": 4.845859908808074e-06, + "loss": 0.0158, + "step": 140 + }, + { + "epoch": 1.8552631578947367, + "grad_norm": 0.22595159709453583, + "learning_rate": 4.838141852943891e-06, + "loss": 0.0101, + "step": 141 + }, + { + "epoch": 1.868421052631579, + "grad_norm": 0.2811257243156433, + "learning_rate": 4.830241714756099e-06, + "loss": 0.0111, + "step": 142 + }, + { + "epoch": 1.881578947368421, + "grad_norm": 0.1875840127468109, + "learning_rate": 4.822160109466361e-06, + "loss": 0.0086, + "step": 143 + }, + { + "epoch": 1.8947368421052633, + "grad_norm": 0.19390800595283508, + "learning_rate": 4.813897666428054e-06, + "loss": 0.0106, + "step": 144 + }, + { + "epoch": 1.9078947368421053, + "grad_norm": 0.3725268244743347, + "learning_rate": 4.805455029077255e-06, + "loss": 0.0095, + "step": 145 + }, + { + "epoch": 1.9210526315789473, + "grad_norm": 0.2201736867427826, + "learning_rate": 4.79683285488264e-06, + "loss": 0.0074, + "step": 146 + }, + { + "epoch": 1.9342105263157894, + "grad_norm": 0.17423805594444275, + "learning_rate": 4.788031815294282e-06, + "loss": 0.0072, + "step": 147 + }, + { + "epoch": 1.9473684210526314, + "grad_norm": 0.22169643640518188, + "learning_rate": 4.779052595691355e-06, + "loss": 0.0121, + "step": 148 + }, + { + "epoch": 1.9605263157894737, + "grad_norm": 0.3247295618057251, + "learning_rate": 4.76989589532877e-06, + "loss": 0.0121, + "step": 149 + }, + { + "epoch": 1.973684210526316, + "grad_norm": 0.1830369532108307, + "learning_rate": 4.7605624272827125e-06, + "loss": 0.0077, + "step": 150 + }, + { + "epoch": 1.986842105263158, + "grad_norm": 0.2967239022254944, + "learning_rate": 4.75105291839512e-06, + "loss": 0.0104, + "step": 151 + }, + { + "epoch": 2.0, + "grad_norm": 0.17589347064495087, + "learning_rate": 4.741368109217072e-06, + "loss": 0.0075, + "step": 152 + } + ], + "logging_steps": 1, + "max_steps": 456, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 76, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 9.77843856062441e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-152/training_args.bin b/checkpoint-152/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..bd15c797edde13f0f3e0490d0aec249c013df912 --- /dev/null +++ b/checkpoint-152/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ebcaf257fe89a74904f6ea50b526a559eb74a53ebc4dfb373932a4d0fa515f5 +size 7928 diff --git a/checkpoint-152/zero_to_fp32.py b/checkpoint-152/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/checkpoint-152/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/checkpoint-228/README.md b/checkpoint-228/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1b184114a0c28ed3e4c082c18486736dc818166d --- /dev/null +++ b/checkpoint-228/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Llama-3.3-70B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.0 \ No newline at end of file diff --git a/checkpoint-228/adapter_config.json b/checkpoint-228/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..763d98264e556bcbc60c63c5b9f70b53c7bbe722 --- /dev/null +++ b/checkpoint-228/adapter_config.json @@ -0,0 +1,42 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.3-70B-Instruct", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 512, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [ + "embed_tokens", + "lm_head" + ], + "peft_type": "LORA", + "r": 256, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "gate_proj", + "up_proj", + "k_proj", + "q_proj", + "down_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-228/adapter_model.safetensors b/checkpoint-228/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..bdab41222c049449d79a1003853f62a85dec55c4 --- /dev/null +++ b/checkpoint-228/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18fab299726d605d2f5ceb2a11af287f81e2135549b729f0075df0de5f20ad6a +size 10829849744 diff --git a/checkpoint-228/global_step228/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-228/global_step228/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e391abed0c614f6524d84e63ac92a01f9c4305a2 --- /dev/null +++ b/checkpoint-228/global_step228/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aae6a6acd59fa9c511c32a1dd91691882210cb133c903a180bcd7b3bd3f8f9dd +size 21659418140 diff --git a/checkpoint-228/global_step228/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-228/global_step228/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..79435d1c62a06b89768c0b30b77dc807647bb34d --- /dev/null +++ b/checkpoint-228/global_step228/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4136b0bdb400646b469fedc212cb6af0720e7e212a5f4817b86eb87fc44b1e8 +size 21659457372 diff --git a/checkpoint-228/global_step228/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-228/global_step228/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..791676147b126d57964031881da2d1f26eb0d39d --- /dev/null +++ b/checkpoint-228/global_step228/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ee1c09b33b0ff556569f218ec8aca8be0a1df4d9f4618c6063e6141b26bb20c +size 21659417820 diff --git a/checkpoint-228/global_step228/mp_rank_00_model_states.pt b/checkpoint-228/global_step228/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4bd54e1359a3df13ce38f367e6672a14cce4c5aa --- /dev/null +++ b/checkpoint-228/global_step228/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:687f3ef46063e7ad784945fe4bec3c074ae2191658556817f6489e66ca84f401 +size 11918643933 diff --git a/checkpoint-228/latest b/checkpoint-228/latest new file mode 100644 index 0000000000000000000000000000000000000000..74f667dd5aec7b1dcf458da255b4d04f2e864037 --- /dev/null +++ b/checkpoint-228/latest @@ -0,0 +1 @@ +global_step228 \ No newline at end of file diff --git a/checkpoint-228/rng_state_0.pth b/checkpoint-228/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..7e4e448cfbd2b0add7bf99082d4db1840a91b8ff --- /dev/null +++ b/checkpoint-228/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f597b8a8ff3fa0c1ca0852531a2c83f947d8ea6229f12dcf84cd40e9d2bdd735 +size 14768 diff --git a/checkpoint-228/rng_state_1.pth b/checkpoint-228/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..23d948eb6390eec22634357b14847f9feadb29dc --- /dev/null +++ b/checkpoint-228/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2630593d343fda4e989879bfd0f94abc55cf145925788b6d823f88bb73bfdfe +size 14768 diff --git a/checkpoint-228/rng_state_2.pth b/checkpoint-228/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..8ee328e2bdcbd65a0af107cef8782b2d2759fb7b --- /dev/null +++ b/checkpoint-228/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7138f65fc4992f28f481beb719c5f1191669f411d0001b0b14e2535745da64d +size 14768 diff --git a/checkpoint-228/scheduler.pt b/checkpoint-228/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..281433de4d85a441705e6ae0cdc6d3d9fb9482f3 --- /dev/null +++ b/checkpoint-228/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d7d1e7bc6044d2a4e5e390e5599228af42cccdc946da11715716db6eef73066 +size 1064 diff --git a/checkpoint-228/special_tokens_map.json b/checkpoint-228/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..278b7f0f84be865c4687700ee7b3c63d89a51e18 --- /dev/null +++ b/checkpoint-228/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-228/tokenizer.json b/checkpoint-228/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-228/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-228/tokenizer_config.json b/checkpoint-228/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ca91a2ef55f4239a7af81d7c9abb05f53621a07b --- /dev/null +++ b/checkpoint-228/tokenizer_config.json @@ -0,0 +1,2064 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|end_of_text|>", + "tokenizer_class": "PreTrainedTokenizer" +} diff --git a/checkpoint-228/trainer_state.json b/checkpoint-228/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..4b2428cc9afa27aca4e40c00cc221a0822b471c2 --- /dev/null +++ b/checkpoint-228/trainer_state.json @@ -0,0 +1,1629 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 228, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.013157894736842105, + "grad_norm": 37.79440689086914, + "learning_rate": 5.0000000000000004e-08, + "loss": 3.1402, + "step": 1 + }, + { + "epoch": 0.02631578947368421, + "grad_norm": 38.45823287963867, + "learning_rate": 1.0000000000000001e-07, + "loss": 3.1787, + "step": 2 + }, + { + "epoch": 0.039473684210526314, + "grad_norm": 38.25625228881836, + "learning_rate": 1.5000000000000002e-07, + "loss": 3.1316, + "step": 3 + }, + { + "epoch": 0.05263157894736842, + "grad_norm": 37.2024040222168, + "learning_rate": 2.0000000000000002e-07, + "loss": 3.1011, + "step": 4 + }, + { + "epoch": 0.06578947368421052, + "grad_norm": 38.17294692993164, + "learning_rate": 2.5000000000000004e-07, + "loss": 3.133, + "step": 5 + }, + { + "epoch": 0.07894736842105263, + "grad_norm": 37.374794006347656, + "learning_rate": 3.0000000000000004e-07, + "loss": 3.0731, + "step": 6 + }, + { + "epoch": 0.09210526315789473, + "grad_norm": 37.226966857910156, + "learning_rate": 3.5000000000000004e-07, + "loss": 3.069, + "step": 7 + }, + { + "epoch": 0.10526315789473684, + "grad_norm": 38.40094757080078, + "learning_rate": 4.0000000000000003e-07, + "loss": 3.1223, + "step": 8 + }, + { + "epoch": 0.11842105263157894, + "grad_norm": 37.86320877075195, + "learning_rate": 4.5000000000000003e-07, + "loss": 3.062, + "step": 9 + }, + { + "epoch": 0.13157894736842105, + "grad_norm": 38.02171325683594, + "learning_rate": 5.000000000000001e-07, + "loss": 3.0008, + "step": 10 + }, + { + "epoch": 0.14473684210526316, + "grad_norm": 38.5522346496582, + "learning_rate": 5.5e-07, + "loss": 3.0047, + "step": 11 + }, + { + "epoch": 0.15789473684210525, + "grad_norm": 37.72829818725586, + "learning_rate": 6.000000000000001e-07, + "loss": 2.9274, + "step": 12 + }, + { + "epoch": 0.17105263157894737, + "grad_norm": 38.488494873046875, + "learning_rate": 6.5e-07, + "loss": 2.8727, + "step": 13 + }, + { + "epoch": 0.18421052631578946, + "grad_norm": 38.87471389770508, + "learning_rate": 7.000000000000001e-07, + "loss": 2.8422, + "step": 14 + }, + { + "epoch": 0.19736842105263158, + "grad_norm": 37.584896087646484, + "learning_rate": 7.5e-07, + "loss": 2.6728, + "step": 15 + }, + { + "epoch": 0.21052631578947367, + "grad_norm": 37.04607391357422, + "learning_rate": 8.000000000000001e-07, + "loss": 2.5215, + "step": 16 + }, + { + "epoch": 0.2236842105263158, + "grad_norm": 37.30121994018555, + "learning_rate": 8.500000000000001e-07, + "loss": 2.4689, + "step": 17 + }, + { + "epoch": 0.23684210526315788, + "grad_norm": 35.99961853027344, + "learning_rate": 9.000000000000001e-07, + "loss": 2.3, + "step": 18 + }, + { + "epoch": 0.25, + "grad_norm": 35.817543029785156, + "learning_rate": 9.500000000000001e-07, + "loss": 2.1423, + "step": 19 + }, + { + "epoch": 0.2631578947368421, + "grad_norm": 35.056915283203125, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.9639, + "step": 20 + }, + { + "epoch": 0.27631578947368424, + "grad_norm": 34.83850860595703, + "learning_rate": 1.0500000000000001e-06, + "loss": 1.7845, + "step": 21 + }, + { + "epoch": 0.2894736842105263, + "grad_norm": 34.32366943359375, + "learning_rate": 1.1e-06, + "loss": 1.5864, + "step": 22 + }, + { + "epoch": 0.3026315789473684, + "grad_norm": 33.79611587524414, + "learning_rate": 1.1500000000000002e-06, + "loss": 1.4011, + "step": 23 + }, + { + "epoch": 0.3157894736842105, + "grad_norm": 32.596031188964844, + "learning_rate": 1.2000000000000002e-06, + "loss": 1.195, + "step": 24 + }, + { + "epoch": 0.32894736842105265, + "grad_norm": 30.045007705688477, + "learning_rate": 1.25e-06, + "loss": 0.9883, + "step": 25 + }, + { + "epoch": 0.34210526315789475, + "grad_norm": 24.89093589782715, + "learning_rate": 1.3e-06, + "loss": 0.7669, + "step": 26 + }, + { + "epoch": 0.35526315789473684, + "grad_norm": 23.454408645629883, + "learning_rate": 1.3500000000000002e-06, + "loss": 0.6304, + "step": 27 + }, + { + "epoch": 0.3684210526315789, + "grad_norm": 19.837312698364258, + "learning_rate": 1.4000000000000001e-06, + "loss": 0.4717, + "step": 28 + }, + { + "epoch": 0.3815789473684211, + "grad_norm": 15.185093879699707, + "learning_rate": 1.45e-06, + "loss": 0.363, + "step": 29 + }, + { + "epoch": 0.39473684210526316, + "grad_norm": 9.057796478271484, + "learning_rate": 1.5e-06, + "loss": 0.2439, + "step": 30 + }, + { + "epoch": 0.40789473684210525, + "grad_norm": 5.976982593536377, + "learning_rate": 1.5500000000000002e-06, + "loss": 0.1864, + "step": 31 + }, + { + "epoch": 0.42105263157894735, + "grad_norm": 3.067375421524048, + "learning_rate": 1.6000000000000001e-06, + "loss": 0.1134, + "step": 32 + }, + { + "epoch": 0.4342105263157895, + "grad_norm": 2.3589119911193848, + "learning_rate": 1.6500000000000003e-06, + "loss": 0.0985, + "step": 33 + }, + { + "epoch": 0.4473684210526316, + "grad_norm": 2.0044353008270264, + "learning_rate": 1.7000000000000002e-06, + "loss": 0.0859, + "step": 34 + }, + { + "epoch": 0.4605263157894737, + "grad_norm": 1.4279972314834595, + "learning_rate": 1.75e-06, + "loss": 0.0728, + "step": 35 + }, + { + "epoch": 0.47368421052631576, + "grad_norm": 0.9807674288749695, + "learning_rate": 1.8000000000000001e-06, + "loss": 0.061, + "step": 36 + }, + { + "epoch": 0.4868421052631579, + "grad_norm": 0.906160295009613, + "learning_rate": 1.85e-06, + "loss": 0.0676, + "step": 37 + }, + { + "epoch": 0.5, + "grad_norm": 0.8837690353393555, + "learning_rate": 1.9000000000000002e-06, + "loss": 0.0622, + "step": 38 + }, + { + "epoch": 0.5131578947368421, + "grad_norm": 0.9579435586929321, + "learning_rate": 1.9500000000000004e-06, + "loss": 0.0557, + "step": 39 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 0.8149510622024536, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.0555, + "step": 40 + }, + { + "epoch": 0.5394736842105263, + "grad_norm": 0.8899760246276855, + "learning_rate": 2.05e-06, + "loss": 0.0517, + "step": 41 + }, + { + "epoch": 0.5526315789473685, + "grad_norm": 0.6007645130157471, + "learning_rate": 2.1000000000000002e-06, + "loss": 0.0518, + "step": 42 + }, + { + "epoch": 0.5657894736842105, + "grad_norm": 0.48819127678871155, + "learning_rate": 2.15e-06, + "loss": 0.0429, + "step": 43 + }, + { + "epoch": 0.5789473684210527, + "grad_norm": 0.42939358949661255, + "learning_rate": 2.2e-06, + "loss": 0.0459, + "step": 44 + }, + { + "epoch": 0.5921052631578947, + "grad_norm": 0.5706579685211182, + "learning_rate": 2.25e-06, + "loss": 0.0453, + "step": 45 + }, + { + "epoch": 0.6052631578947368, + "grad_norm": 0.3034597337245941, + "learning_rate": 2.3000000000000004e-06, + "loss": 0.0421, + "step": 46 + }, + { + "epoch": 0.618421052631579, + "grad_norm": 0.5601783394813538, + "learning_rate": 2.35e-06, + "loss": 0.0411, + "step": 47 + }, + { + "epoch": 0.631578947368421, + "grad_norm": 0.35388317704200745, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.04, + "step": 48 + }, + { + "epoch": 0.6447368421052632, + "grad_norm": 0.48609891533851624, + "learning_rate": 2.4500000000000003e-06, + "loss": 0.04, + "step": 49 + }, + { + "epoch": 0.6578947368421053, + "grad_norm": 0.4638507068157196, + "learning_rate": 2.5e-06, + "loss": 0.0369, + "step": 50 + }, + { + "epoch": 0.6710526315789473, + "grad_norm": 0.5685771703720093, + "learning_rate": 2.55e-06, + "loss": 0.0428, + "step": 51 + }, + { + "epoch": 0.6842105263157895, + "grad_norm": 0.46358174085617065, + "learning_rate": 2.6e-06, + "loss": 0.0483, + "step": 52 + }, + { + "epoch": 0.6973684210526315, + "grad_norm": 0.35054436326026917, + "learning_rate": 2.6500000000000005e-06, + "loss": 0.0391, + "step": 53 + }, + { + "epoch": 0.7105263157894737, + "grad_norm": 0.3350559175014496, + "learning_rate": 2.7000000000000004e-06, + "loss": 0.039, + "step": 54 + }, + { + "epoch": 0.7236842105263158, + "grad_norm": 0.2875112295150757, + "learning_rate": 2.7500000000000004e-06, + "loss": 0.0383, + "step": 55 + }, + { + "epoch": 0.7368421052631579, + "grad_norm": 0.4492928683757782, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.0358, + "step": 56 + }, + { + "epoch": 0.75, + "grad_norm": 0.29484888911247253, + "learning_rate": 2.85e-06, + "loss": 0.0355, + "step": 57 + }, + { + "epoch": 0.7631578947368421, + "grad_norm": 0.36551928520202637, + "learning_rate": 2.9e-06, + "loss": 0.0403, + "step": 58 + }, + { + "epoch": 0.7763157894736842, + "grad_norm": 0.4458053708076477, + "learning_rate": 2.95e-06, + "loss": 0.0342, + "step": 59 + }, + { + "epoch": 0.7894736842105263, + "grad_norm": 0.34047460556030273, + "learning_rate": 3e-06, + "loss": 0.0302, + "step": 60 + }, + { + "epoch": 0.8026315789473685, + "grad_norm": 0.3420606255531311, + "learning_rate": 3.05e-06, + "loss": 0.034, + "step": 61 + }, + { + "epoch": 0.8157894736842105, + "grad_norm": 0.3902851939201355, + "learning_rate": 3.1000000000000004e-06, + "loss": 0.0327, + "step": 62 + }, + { + "epoch": 0.8289473684210527, + "grad_norm": 0.29165828227996826, + "learning_rate": 3.1500000000000003e-06, + "loss": 0.0341, + "step": 63 + }, + { + "epoch": 0.8421052631578947, + "grad_norm": 0.40872958302497864, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.035, + "step": 64 + }, + { + "epoch": 0.8552631578947368, + "grad_norm": 0.36295783519744873, + "learning_rate": 3.2500000000000002e-06, + "loss": 0.0323, + "step": 65 + }, + { + "epoch": 0.868421052631579, + "grad_norm": 0.3857724368572235, + "learning_rate": 3.3000000000000006e-06, + "loss": 0.0336, + "step": 66 + }, + { + "epoch": 0.881578947368421, + "grad_norm": 0.3207017481327057, + "learning_rate": 3.3500000000000005e-06, + "loss": 0.0332, + "step": 67 + }, + { + "epoch": 0.8947368421052632, + "grad_norm": 0.2903987169265747, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.0327, + "step": 68 + }, + { + "epoch": 0.9078947368421053, + "grad_norm": 0.3386954963207245, + "learning_rate": 3.45e-06, + "loss": 0.0308, + "step": 69 + }, + { + "epoch": 0.9210526315789473, + "grad_norm": 0.4339621365070343, + "learning_rate": 3.5e-06, + "loss": 0.0361, + "step": 70 + }, + { + "epoch": 0.9342105263157895, + "grad_norm": 0.28095564246177673, + "learning_rate": 3.5500000000000003e-06, + "loss": 0.0306, + "step": 71 + }, + { + "epoch": 0.9473684210526315, + "grad_norm": 0.4141469895839691, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.028, + "step": 72 + }, + { + "epoch": 0.9605263157894737, + "grad_norm": 0.35212820768356323, + "learning_rate": 3.65e-06, + "loss": 0.032, + "step": 73 + }, + { + "epoch": 0.9736842105263158, + "grad_norm": 0.26956063508987427, + "learning_rate": 3.7e-06, + "loss": 0.0294, + "step": 74 + }, + { + "epoch": 0.9868421052631579, + "grad_norm": 0.32735681533813477, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.0272, + "step": 75 + }, + { + "epoch": 1.0, + "grad_norm": 0.4906782805919647, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.0324, + "step": 76 + }, + { + "epoch": 1.013157894736842, + "grad_norm": 0.3451901078224182, + "learning_rate": 3.85e-06, + "loss": 0.0288, + "step": 77 + }, + { + "epoch": 1.0263157894736843, + "grad_norm": 0.30598726868629456, + "learning_rate": 3.900000000000001e-06, + "loss": 0.0305, + "step": 78 + }, + { + "epoch": 1.0394736842105263, + "grad_norm": 0.31189921498298645, + "learning_rate": 3.95e-06, + "loss": 0.0274, + "step": 79 + }, + { + "epoch": 1.0526315789473684, + "grad_norm": 0.31895947456359863, + "learning_rate": 4.000000000000001e-06, + "loss": 0.0236, + "step": 80 + }, + { + "epoch": 1.0657894736842106, + "grad_norm": 0.3290308117866516, + "learning_rate": 4.05e-06, + "loss": 0.0284, + "step": 81 + }, + { + "epoch": 1.0789473684210527, + "grad_norm": 0.3651576638221741, + "learning_rate": 4.1e-06, + "loss": 0.0274, + "step": 82 + }, + { + "epoch": 1.0921052631578947, + "grad_norm": 0.2393084615468979, + "learning_rate": 4.15e-06, + "loss": 0.0301, + "step": 83 + }, + { + "epoch": 1.1052631578947367, + "grad_norm": 0.333898663520813, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.0235, + "step": 84 + }, + { + "epoch": 1.118421052631579, + "grad_norm": 0.3287582993507385, + "learning_rate": 4.25e-06, + "loss": 0.0248, + "step": 85 + }, + { + "epoch": 1.131578947368421, + "grad_norm": 0.3432455360889435, + "learning_rate": 4.3e-06, + "loss": 0.026, + "step": 86 + }, + { + "epoch": 1.1447368421052633, + "grad_norm": 0.3176783621311188, + "learning_rate": 4.350000000000001e-06, + "loss": 0.0249, + "step": 87 + }, + { + "epoch": 1.1578947368421053, + "grad_norm": 0.33373433351516724, + "learning_rate": 4.4e-06, + "loss": 0.0251, + "step": 88 + }, + { + "epoch": 1.1710526315789473, + "grad_norm": 0.36087968945503235, + "learning_rate": 4.450000000000001e-06, + "loss": 0.0251, + "step": 89 + }, + { + "epoch": 1.1842105263157894, + "grad_norm": 0.3681696057319641, + "learning_rate": 4.5e-06, + "loss": 0.0276, + "step": 90 + }, + { + "epoch": 1.1973684210526316, + "grad_norm": 0.46539774537086487, + "learning_rate": 4.5500000000000005e-06, + "loss": 0.0229, + "step": 91 + }, + { + "epoch": 1.2105263157894737, + "grad_norm": 0.23368288576602936, + "learning_rate": 4.600000000000001e-06, + "loss": 0.021, + "step": 92 + }, + { + "epoch": 1.2236842105263157, + "grad_norm": 0.26623716950416565, + "learning_rate": 4.65e-06, + "loss": 0.0265, + "step": 93 + }, + { + "epoch": 1.236842105263158, + "grad_norm": 0.28750717639923096, + "learning_rate": 4.7e-06, + "loss": 0.0221, + "step": 94 + }, + { + "epoch": 1.25, + "grad_norm": 0.46578383445739746, + "learning_rate": 4.75e-06, + "loss": 0.0236, + "step": 95 + }, + { + "epoch": 1.263157894736842, + "grad_norm": 0.33406543731689453, + "learning_rate": 4.800000000000001e-06, + "loss": 0.0239, + "step": 96 + }, + { + "epoch": 1.2763157894736843, + "grad_norm": 0.21247217059135437, + "learning_rate": 4.85e-06, + "loss": 0.0188, + "step": 97 + }, + { + "epoch": 1.2894736842105263, + "grad_norm": 0.26229164004325867, + "learning_rate": 4.9000000000000005e-06, + "loss": 0.022, + "step": 98 + }, + { + "epoch": 1.3026315789473684, + "grad_norm": 0.2967258393764496, + "learning_rate": 4.95e-06, + "loss": 0.0218, + "step": 99 + }, + { + "epoch": 1.3157894736842106, + "grad_norm": 0.419189453125, + "learning_rate": 5e-06, + "loss": 0.0247, + "step": 100 + }, + { + "epoch": 1.3289473684210527, + "grad_norm": 0.25418952107429504, + "learning_rate": 4.999902656502973e-06, + "loss": 0.0223, + "step": 101 + }, + { + "epoch": 1.3421052631578947, + "grad_norm": 0.20174147188663483, + "learning_rate": 4.9996106335924965e-06, + "loss": 0.0266, + "step": 102 + }, + { + "epoch": 1.3552631578947367, + "grad_norm": 0.21732494235038757, + "learning_rate": 4.999123954009797e-06, + "loss": 0.0188, + "step": 103 + }, + { + "epoch": 1.368421052631579, + "grad_norm": 0.2683119773864746, + "learning_rate": 4.998442655654946e-06, + "loss": 0.0203, + "step": 104 + }, + { + "epoch": 1.381578947368421, + "grad_norm": 0.18175765872001648, + "learning_rate": 4.997566791583916e-06, + "loss": 0.0185, + "step": 105 + }, + { + "epoch": 1.3947368421052633, + "grad_norm": 0.3932501971721649, + "learning_rate": 4.996496430004446e-06, + "loss": 0.0238, + "step": 106 + }, + { + "epoch": 1.4078947368421053, + "grad_norm": 0.31145599484443665, + "learning_rate": 4.995231654270726e-06, + "loss": 0.0199, + "step": 107 + }, + { + "epoch": 1.4210526315789473, + "grad_norm": 0.41356661915779114, + "learning_rate": 4.993772562876909e-06, + "loss": 0.0187, + "step": 108 + }, + { + "epoch": 1.4342105263157894, + "grad_norm": 0.22484919428825378, + "learning_rate": 4.992119269449445e-06, + "loss": 0.0182, + "step": 109 + }, + { + "epoch": 1.4473684210526316, + "grad_norm": 0.28703081607818604, + "learning_rate": 4.990271902738223e-06, + "loss": 0.0239, + "step": 110 + }, + { + "epoch": 1.4605263157894737, + "grad_norm": 0.2394670695066452, + "learning_rate": 4.988230606606552e-06, + "loss": 0.0171, + "step": 111 + }, + { + "epoch": 1.4736842105263157, + "grad_norm": 0.3552885949611664, + "learning_rate": 4.985995540019956e-06, + "loss": 0.0226, + "step": 112 + }, + { + "epoch": 1.486842105263158, + "grad_norm": 0.24968908727169037, + "learning_rate": 4.983566877033791e-06, + "loss": 0.0193, + "step": 113 + }, + { + "epoch": 1.5, + "grad_norm": 0.24420695006847382, + "learning_rate": 4.980944806779698e-06, + "loss": 0.0226, + "step": 114 + }, + { + "epoch": 1.513157894736842, + "grad_norm": 0.34696799516677856, + "learning_rate": 4.9781295334508664e-06, + "loss": 0.02, + "step": 115 + }, + { + "epoch": 1.526315789473684, + "grad_norm": 0.23682132363319397, + "learning_rate": 4.975121276286136e-06, + "loss": 0.0194, + "step": 116 + }, + { + "epoch": 1.5394736842105263, + "grad_norm": 0.2485751509666443, + "learning_rate": 4.9719202695529265e-06, + "loss": 0.0149, + "step": 117 + }, + { + "epoch": 1.5526315789473686, + "grad_norm": 0.2815033495426178, + "learning_rate": 4.968526762528988e-06, + "loss": 0.0153, + "step": 118 + }, + { + "epoch": 1.5657894736842106, + "grad_norm": 0.24127744138240814, + "learning_rate": 4.964941019482995e-06, + "loss": 0.019, + "step": 119 + }, + { + "epoch": 1.5789473684210527, + "grad_norm": 0.2987695038318634, + "learning_rate": 4.961163319653959e-06, + "loss": 0.0165, + "step": 120 + }, + { + "epoch": 1.5921052631578947, + "grad_norm": 0.33492133021354675, + "learning_rate": 4.9571939572294914e-06, + "loss": 0.0185, + "step": 121 + }, + { + "epoch": 1.6052631578947367, + "grad_norm": 0.20466521382331848, + "learning_rate": 4.953033241322887e-06, + "loss": 0.0151, + "step": 122 + }, + { + "epoch": 1.618421052631579, + "grad_norm": 0.36396247148513794, + "learning_rate": 4.948681495949055e-06, + "loss": 0.0138, + "step": 123 + }, + { + "epoch": 1.631578947368421, + "grad_norm": 0.2000381350517273, + "learning_rate": 4.944139059999286e-06, + "loss": 0.0125, + "step": 124 + }, + { + "epoch": 1.6447368421052633, + "grad_norm": 0.24977952241897583, + "learning_rate": 4.939406287214861e-06, + "loss": 0.0152, + "step": 125 + }, + { + "epoch": 1.6578947368421053, + "grad_norm": 0.26705336570739746, + "learning_rate": 4.9344835461595016e-06, + "loss": 0.0148, + "step": 126 + }, + { + "epoch": 1.6710526315789473, + "grad_norm": 0.26699599623680115, + "learning_rate": 4.929371220190671e-06, + "loss": 0.0151, + "step": 127 + }, + { + "epoch": 1.6842105263157894, + "grad_norm": 0.20149633288383484, + "learning_rate": 4.9240697074297205e-06, + "loss": 0.0151, + "step": 128 + }, + { + "epoch": 1.6973684210526314, + "grad_norm": 0.1961003988981247, + "learning_rate": 4.918579420730884e-06, + "loss": 0.0163, + "step": 129 + }, + { + "epoch": 1.7105263157894737, + "grad_norm": 0.2148503214120865, + "learning_rate": 4.912900787649124e-06, + "loss": 0.0137, + "step": 130 + }, + { + "epoch": 1.723684210526316, + "grad_norm": 0.20505128800868988, + "learning_rate": 4.907034250406846e-06, + "loss": 0.0136, + "step": 131 + }, + { + "epoch": 1.736842105263158, + "grad_norm": 0.19462467730045319, + "learning_rate": 4.900980265859449e-06, + "loss": 0.0139, + "step": 132 + }, + { + "epoch": 1.75, + "grad_norm": 0.21602794528007507, + "learning_rate": 4.894739305459754e-06, + "loss": 0.015, + "step": 133 + }, + { + "epoch": 1.763157894736842, + "grad_norm": 0.22933153808116913, + "learning_rate": 4.88831185522129e-06, + "loss": 0.0142, + "step": 134 + }, + { + "epoch": 1.776315789473684, + "grad_norm": 0.1785646229982376, + "learning_rate": 4.881698415680442e-06, + "loss": 0.0097, + "step": 135 + }, + { + "epoch": 1.7894736842105263, + "grad_norm": 0.21535581350326538, + "learning_rate": 4.874899501857477e-06, + "loss": 0.0106, + "step": 136 + }, + { + "epoch": 1.8026315789473686, + "grad_norm": 0.2360723614692688, + "learning_rate": 4.867915643216434e-06, + "loss": 0.0123, + "step": 137 + }, + { + "epoch": 1.8157894736842106, + "grad_norm": 0.18098825216293335, + "learning_rate": 4.860747383623889e-06, + "loss": 0.0126, + "step": 138 + }, + { + "epoch": 1.8289473684210527, + "grad_norm": 0.1836131066083908, + "learning_rate": 4.85339528130661e-06, + "loss": 0.0125, + "step": 139 + }, + { + "epoch": 1.8421052631578947, + "grad_norm": 0.34765973687171936, + "learning_rate": 4.845859908808074e-06, + "loss": 0.0158, + "step": 140 + }, + { + "epoch": 1.8552631578947367, + "grad_norm": 0.22595159709453583, + "learning_rate": 4.838141852943891e-06, + "loss": 0.0101, + "step": 141 + }, + { + "epoch": 1.868421052631579, + "grad_norm": 0.2811257243156433, + "learning_rate": 4.830241714756099e-06, + "loss": 0.0111, + "step": 142 + }, + { + "epoch": 1.881578947368421, + "grad_norm": 0.1875840127468109, + "learning_rate": 4.822160109466361e-06, + "loss": 0.0086, + "step": 143 + }, + { + "epoch": 1.8947368421052633, + "grad_norm": 0.19390800595283508, + "learning_rate": 4.813897666428054e-06, + "loss": 0.0106, + "step": 144 + }, + { + "epoch": 1.9078947368421053, + "grad_norm": 0.3725268244743347, + "learning_rate": 4.805455029077255e-06, + "loss": 0.0095, + "step": 145 + }, + { + "epoch": 1.9210526315789473, + "grad_norm": 0.2201736867427826, + "learning_rate": 4.79683285488264e-06, + "loss": 0.0074, + "step": 146 + }, + { + "epoch": 1.9342105263157894, + "grad_norm": 0.17423805594444275, + "learning_rate": 4.788031815294282e-06, + "loss": 0.0072, + "step": 147 + }, + { + "epoch": 1.9473684210526314, + "grad_norm": 0.22169643640518188, + "learning_rate": 4.779052595691355e-06, + "loss": 0.0121, + "step": 148 + }, + { + "epoch": 1.9605263157894737, + "grad_norm": 0.3247295618057251, + "learning_rate": 4.76989589532877e-06, + "loss": 0.0121, + "step": 149 + }, + { + "epoch": 1.973684210526316, + "grad_norm": 0.1830369532108307, + "learning_rate": 4.7605624272827125e-06, + "loss": 0.0077, + "step": 150 + }, + { + "epoch": 1.986842105263158, + "grad_norm": 0.2967239022254944, + "learning_rate": 4.75105291839512e-06, + "loss": 0.0104, + "step": 151 + }, + { + "epoch": 2.0, + "grad_norm": 0.17589347064495087, + "learning_rate": 4.741368109217072e-06, + "loss": 0.0075, + "step": 152 + }, + { + "epoch": 2.013157894736842, + "grad_norm": 0.15554101765155792, + "learning_rate": 4.7315087539511225e-06, + "loss": 0.0063, + "step": 153 + }, + { + "epoch": 2.026315789473684, + "grad_norm": 0.13191422820091248, + "learning_rate": 4.721475620392567e-06, + "loss": 0.0039, + "step": 154 + }, + { + "epoch": 2.039473684210526, + "grad_norm": 0.1909502148628235, + "learning_rate": 4.711269489869654e-06, + "loss": 0.0055, + "step": 155 + }, + { + "epoch": 2.0526315789473686, + "grad_norm": 0.16942323744297028, + "learning_rate": 4.700891157182729e-06, + "loss": 0.0055, + "step": 156 + }, + { + "epoch": 2.0657894736842106, + "grad_norm": 0.1740521341562271, + "learning_rate": 4.690341430542351e-06, + "loss": 0.006, + "step": 157 + }, + { + "epoch": 2.0789473684210527, + "grad_norm": 0.19565710425376892, + "learning_rate": 4.679621131506347e-06, + "loss": 0.0057, + "step": 158 + }, + { + "epoch": 2.0921052631578947, + "grad_norm": 0.13488221168518066, + "learning_rate": 4.668731094915835e-06, + "loss": 0.0033, + "step": 159 + }, + { + "epoch": 2.1052631578947367, + "grad_norm": 0.16322746872901917, + "learning_rate": 4.657672168830211e-06, + "loss": 0.0042, + "step": 160 + }, + { + "epoch": 2.1184210526315788, + "grad_norm": 0.2087877243757248, + "learning_rate": 4.646445214461105e-06, + "loss": 0.0052, + "step": 161 + }, + { + "epoch": 2.1315789473684212, + "grad_norm": 0.12736408412456512, + "learning_rate": 4.635051106105316e-06, + "loss": 0.0051, + "step": 162 + }, + { + "epoch": 2.1447368421052633, + "grad_norm": 0.13264045119285583, + "learning_rate": 4.623490731076728e-06, + "loss": 0.0035, + "step": 163 + }, + { + "epoch": 2.1578947368421053, + "grad_norm": 0.2015363723039627, + "learning_rate": 4.6117649896372055e-06, + "loss": 0.0055, + "step": 164 + }, + { + "epoch": 2.1710526315789473, + "grad_norm": 0.21640510857105255, + "learning_rate": 4.59987479492649e-06, + "loss": 0.008, + "step": 165 + }, + { + "epoch": 2.1842105263157894, + "grad_norm": 0.17276327311992645, + "learning_rate": 4.587821072891089e-06, + "loss": 0.0058, + "step": 166 + }, + { + "epoch": 2.1973684210526314, + "grad_norm": 0.15923018753528595, + "learning_rate": 4.5756047622121665e-06, + "loss": 0.0039, + "step": 167 + }, + { + "epoch": 2.2105263157894735, + "grad_norm": 0.14791631698608398, + "learning_rate": 4.563226814232444e-06, + "loss": 0.0032, + "step": 168 + }, + { + "epoch": 2.223684210526316, + "grad_norm": 0.16776816546916962, + "learning_rate": 4.550688192882115e-06, + "loss": 0.0043, + "step": 169 + }, + { + "epoch": 2.236842105263158, + "grad_norm": 0.12374848127365112, + "learning_rate": 4.53798987460378e-06, + "loss": 0.0035, + "step": 170 + }, + { + "epoch": 2.25, + "grad_norm": 0.13051433861255646, + "learning_rate": 4.525132848276405e-06, + "loss": 0.0036, + "step": 171 + }, + { + "epoch": 2.263157894736842, + "grad_norm": 0.12607790529727936, + "learning_rate": 4.512118115138315e-06, + "loss": 0.0052, + "step": 172 + }, + { + "epoch": 2.276315789473684, + "grad_norm": 0.09630817174911499, + "learning_rate": 4.498946688709216e-06, + "loss": 0.0031, + "step": 173 + }, + { + "epoch": 2.2894736842105265, + "grad_norm": 0.11332327872514725, + "learning_rate": 4.485619594711278e-06, + "loss": 0.0043, + "step": 174 + }, + { + "epoch": 2.3026315789473686, + "grad_norm": 0.16632875800132751, + "learning_rate": 4.4721378709892475e-06, + "loss": 0.005, + "step": 175 + }, + { + "epoch": 2.3157894736842106, + "grad_norm": 0.12856662273406982, + "learning_rate": 4.4585025674296315e-06, + "loss": 0.0031, + "step": 176 + }, + { + "epoch": 2.3289473684210527, + "grad_norm": 0.197174072265625, + "learning_rate": 4.444714745878936e-06, + "loss": 0.0045, + "step": 177 + }, + { + "epoch": 2.3421052631578947, + "grad_norm": 0.17151176929473877, + "learning_rate": 4.430775480060973e-06, + "loss": 0.0044, + "step": 178 + }, + { + "epoch": 2.3552631578947367, + "grad_norm": 0.14734052121639252, + "learning_rate": 4.416685855493246e-06, + "loss": 0.0053, + "step": 179 + }, + { + "epoch": 2.3684210526315788, + "grad_norm": 0.13286560773849487, + "learning_rate": 4.4024469694024194e-06, + "loss": 0.0039, + "step": 180 + }, + { + "epoch": 2.3815789473684212, + "grad_norm": 0.1636727899312973, + "learning_rate": 4.388059930638865e-06, + "loss": 0.0039, + "step": 181 + }, + { + "epoch": 2.3947368421052633, + "grad_norm": 0.1082785576581955, + "learning_rate": 4.373525859590313e-06, + "loss": 0.0025, + "step": 182 + }, + { + "epoch": 2.4078947368421053, + "grad_norm": 0.1716354638338089, + "learning_rate": 4.358845888094607e-06, + "loss": 0.004, + "step": 183 + }, + { + "epoch": 2.4210526315789473, + "grad_norm": 0.14045757055282593, + "learning_rate": 4.3440211593515556e-06, + "loss": 0.0026, + "step": 184 + }, + { + "epoch": 2.4342105263157894, + "grad_norm": 0.1682705134153366, + "learning_rate": 4.32905282783391e-06, + "loss": 0.0042, + "step": 185 + }, + { + "epoch": 2.4473684210526314, + "grad_norm": 0.11872018873691559, + "learning_rate": 4.313942059197457e-06, + "loss": 0.0028, + "step": 186 + }, + { + "epoch": 2.4605263157894735, + "grad_norm": 0.12182936072349548, + "learning_rate": 4.298690030190247e-06, + "loss": 0.0018, + "step": 187 + }, + { + "epoch": 2.473684210526316, + "grad_norm": 0.2031281590461731, + "learning_rate": 4.283297928560951e-06, + "loss": 0.0032, + "step": 188 + }, + { + "epoch": 2.486842105263158, + "grad_norm": 0.0959291160106659, + "learning_rate": 4.267766952966369e-06, + "loss": 0.0015, + "step": 189 + }, + { + "epoch": 2.5, + "grad_norm": 0.15291978418827057, + "learning_rate": 4.252098312878083e-06, + "loss": 0.0036, + "step": 190 + }, + { + "epoch": 2.513157894736842, + "grad_norm": 0.15930163860321045, + "learning_rate": 4.236293228488267e-06, + "loss": 0.0047, + "step": 191 + }, + { + "epoch": 2.526315789473684, + "grad_norm": 0.2150997817516327, + "learning_rate": 4.220352930614672e-06, + "loss": 0.0038, + "step": 192 + }, + { + "epoch": 2.5394736842105265, + "grad_norm": 0.1317511945962906, + "learning_rate": 4.204278660604767e-06, + "loss": 0.0032, + "step": 193 + }, + { + "epoch": 2.5526315789473686, + "grad_norm": 0.07808093726634979, + "learning_rate": 4.1880716702390764e-06, + "loss": 0.0011, + "step": 194 + }, + { + "epoch": 2.5657894736842106, + "grad_norm": 0.13284094631671906, + "learning_rate": 4.171733221633695e-06, + "loss": 0.0037, + "step": 195 + }, + { + "epoch": 2.5789473684210527, + "grad_norm": 0.16264718770980835, + "learning_rate": 4.155264587142002e-06, + "loss": 0.0039, + "step": 196 + }, + { + "epoch": 2.5921052631578947, + "grad_norm": 0.10431212931871414, + "learning_rate": 4.138667049255574e-06, + "loss": 0.0023, + "step": 197 + }, + { + "epoch": 2.6052631578947367, + "grad_norm": 0.08813079446554184, + "learning_rate": 4.121941900504316e-06, + "loss": 0.0018, + "step": 198 + }, + { + "epoch": 2.6184210526315788, + "grad_norm": 0.22164294123649597, + "learning_rate": 4.105090443355801e-06, + "loss": 0.0037, + "step": 199 + }, + { + "epoch": 2.6315789473684212, + "grad_norm": 0.09111231565475464, + "learning_rate": 4.088113990113846e-06, + "loss": 0.0019, + "step": 200 + }, + { + "epoch": 2.6447368421052633, + "grad_norm": 0.0871724933385849, + "learning_rate": 4.071013862816311e-06, + "loss": 0.0014, + "step": 201 + }, + { + "epoch": 2.6578947368421053, + "grad_norm": 0.2138734757900238, + "learning_rate": 4.0537913931321495e-06, + "loss": 0.0022, + "step": 202 + }, + { + "epoch": 2.6710526315789473, + "grad_norm": 0.11238733679056168, + "learning_rate": 4.036447922257699e-06, + "loss": 0.0023, + "step": 203 + }, + { + "epoch": 2.6842105263157894, + "grad_norm": 0.0815015360713005, + "learning_rate": 4.018984800812248e-06, + "loss": 0.0011, + "step": 204 + }, + { + "epoch": 2.6973684210526314, + "grad_norm": 0.304352343082428, + "learning_rate": 4.001403388732842e-06, + "loss": 0.003, + "step": 205 + }, + { + "epoch": 2.7105263157894735, + "grad_norm": 0.10469458252191544, + "learning_rate": 3.983705055168391e-06, + "loss": 0.0009, + "step": 206 + }, + { + "epoch": 2.723684210526316, + "grad_norm": 0.1440751701593399, + "learning_rate": 3.965891178373038e-06, + "loss": 0.0025, + "step": 207 + }, + { + "epoch": 2.736842105263158, + "grad_norm": 0.2173687070608139, + "learning_rate": 3.947963145598833e-06, + "loss": 0.0028, + "step": 208 + }, + { + "epoch": 2.75, + "grad_norm": 0.2922506332397461, + "learning_rate": 3.929922352987702e-06, + "loss": 0.003, + "step": 209 + }, + { + "epoch": 2.763157894736842, + "grad_norm": 0.18853916227817535, + "learning_rate": 3.911770205462717e-06, + "loss": 0.0014, + "step": 210 + }, + { + "epoch": 2.776315789473684, + "grad_norm": 0.12060266733169556, + "learning_rate": 3.8935081166186935e-06, + "loss": 0.0015, + "step": 211 + }, + { + "epoch": 2.7894736842105265, + "grad_norm": 0.14512351155281067, + "learning_rate": 3.875137508612104e-06, + "loss": 0.002, + "step": 212 + }, + { + "epoch": 2.8026315789473686, + "grad_norm": 0.15343990921974182, + "learning_rate": 3.856659812050328e-06, + "loss": 0.0012, + "step": 213 + }, + { + "epoch": 2.8157894736842106, + "grad_norm": 0.09639029949903488, + "learning_rate": 3.838076465880248e-06, + "loss": 0.0017, + "step": 214 + }, + { + "epoch": 2.8289473684210527, + "grad_norm": 0.09907295554876328, + "learning_rate": 3.819388917276186e-06, + "loss": 0.0012, + "step": 215 + }, + { + "epoch": 2.8421052631578947, + "grad_norm": 0.05898207053542137, + "learning_rate": 3.8005986215272056e-06, + "loss": 0.0006, + "step": 216 + }, + { + "epoch": 2.8552631578947367, + "grad_norm": 0.10509718954563141, + "learning_rate": 3.7817070419237866e-06, + "loss": 0.0013, + "step": 217 + }, + { + "epoch": 2.8684210526315788, + "grad_norm": 0.17495931684970856, + "learning_rate": 3.7627156496438686e-06, + "loss": 0.001, + "step": 218 + }, + { + "epoch": 2.8815789473684212, + "grad_norm": 0.12321923673152924, + "learning_rate": 3.7436259236382797e-06, + "loss": 0.001, + "step": 219 + }, + { + "epoch": 2.8947368421052633, + "grad_norm": 0.11147578060626984, + "learning_rate": 3.7244393505155713e-06, + "loss": 0.0015, + "step": 220 + }, + { + "epoch": 2.9078947368421053, + "grad_norm": 0.06215621903538704, + "learning_rate": 3.7051574244262412e-06, + "loss": 0.0006, + "step": 221 + }, + { + "epoch": 2.9210526315789473, + "grad_norm": 0.03004705347120762, + "learning_rate": 3.6857816469463806e-06, + "loss": 0.0002, + "step": 222 + }, + { + "epoch": 2.9342105263157894, + "grad_norm": 0.09312062710523605, + "learning_rate": 3.6663135269607413e-06, + "loss": 0.0014, + "step": 223 + }, + { + "epoch": 2.9473684210526314, + "grad_norm": 0.05324690416455269, + "learning_rate": 3.6467545805452266e-06, + "loss": 0.0005, + "step": 224 + }, + { + "epoch": 2.9605263157894735, + "grad_norm": 0.06438126415014267, + "learning_rate": 3.6271063308488298e-06, + "loss": 0.0005, + "step": 225 + }, + { + "epoch": 2.973684210526316, + "grad_norm": 0.08646634221076965, + "learning_rate": 3.6073703079750204e-06, + "loss": 0.0006, + "step": 226 + }, + { + "epoch": 2.986842105263158, + "grad_norm": 0.05682829022407532, + "learning_rate": 3.5875480488625847e-06, + "loss": 0.0006, + "step": 227 + }, + { + "epoch": 3.0, + "grad_norm": 0.11831886321306229, + "learning_rate": 3.5676410971659404e-06, + "loss": 0.0006, + "step": 228 + } + ], + "logging_steps": 1, + "max_steps": 456, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 76, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.4667657840936616e+19, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-228/training_args.bin b/checkpoint-228/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..bd15c797edde13f0f3e0490d0aec249c013df912 --- /dev/null +++ b/checkpoint-228/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ebcaf257fe89a74904f6ea50b526a559eb74a53ebc4dfb373932a4d0fa515f5 +size 7928 diff --git a/checkpoint-228/zero_to_fp32.py b/checkpoint-228/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/checkpoint-228/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/checkpoint-304/README.md b/checkpoint-304/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1b184114a0c28ed3e4c082c18486736dc818166d --- /dev/null +++ b/checkpoint-304/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Llama-3.3-70B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.0 \ No newline at end of file diff --git a/checkpoint-304/adapter_config.json b/checkpoint-304/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..763d98264e556bcbc60c63c5b9f70b53c7bbe722 --- /dev/null +++ b/checkpoint-304/adapter_config.json @@ -0,0 +1,42 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.3-70B-Instruct", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 512, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [ + "embed_tokens", + "lm_head" + ], + "peft_type": "LORA", + "r": 256, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "gate_proj", + "up_proj", + "k_proj", + "q_proj", + "down_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-304/adapter_model.safetensors b/checkpoint-304/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4d3683983b7b09039b8095da811c679df8d463ec --- /dev/null +++ b/checkpoint-304/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0782e530ad6759cc55d8e42fe4991859dad2ca4ff1f4601a0faa88f06d5152d8 +size 10829849744 diff --git a/checkpoint-304/global_step304/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-304/global_step304/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fb8aef282684d1a97b787721903b4808052e7e69 --- /dev/null +++ b/checkpoint-304/global_step304/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13f3daa4f2546f4a5efcf1c806a775680a332f7f2fb33101eb96327c03d97763 +size 21659418140 diff --git a/checkpoint-304/global_step304/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-304/global_step304/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e691b2ef6e6ad840c2522079e3070e157fdd3dc1 --- /dev/null +++ b/checkpoint-304/global_step304/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d751ed5a8d3b01c20f42ab3e61bc2b1c4b20ab1c966aa10a08075cee41c7c16 +size 21659457372 diff --git a/checkpoint-304/global_step304/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-304/global_step304/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..248a083dafa39f1e577164331390baf4b1b74f81 --- /dev/null +++ b/checkpoint-304/global_step304/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:197cddc77f298f1974484ed707bb201f68960b230d2c69769978ba7faad90d46 +size 21659417820 diff --git a/checkpoint-304/global_step304/mp_rank_00_model_states.pt b/checkpoint-304/global_step304/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..18d97ce6e9fc696eeda12a5670b11bc37f26bbd7 --- /dev/null +++ b/checkpoint-304/global_step304/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63dc5cefb7fa37a2b35e90209ec149de61aa019306a4b43c92b2b582756a4b5a +size 11918643933 diff --git a/checkpoint-304/latest b/checkpoint-304/latest new file mode 100644 index 0000000000000000000000000000000000000000..3761843487f150944adef329837340fd2ed0b7ff --- /dev/null +++ b/checkpoint-304/latest @@ -0,0 +1 @@ +global_step304 \ No newline at end of file diff --git a/checkpoint-304/rng_state_0.pth b/checkpoint-304/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..ff604c8c4474afbed94401dfd5d6c1473f9d3583 --- /dev/null +++ b/checkpoint-304/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99a3d324f952ce2014535fcab16510a458a3013d4a495eadb02ed7fff34e2363 +size 14768 diff --git a/checkpoint-304/rng_state_1.pth b/checkpoint-304/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..ef8d9c6fa11f39be0d41b6080c2fdbc5dcfb7349 --- /dev/null +++ b/checkpoint-304/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8aa11659794f1549b168457979fc560787dd21e97b4f2ad4e52b23c8576c2de +size 14768 diff --git a/checkpoint-304/rng_state_2.pth b/checkpoint-304/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..5626492adca4ab2303b5cbd44ed33ad4523b3c8e --- /dev/null +++ b/checkpoint-304/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6b24784f5720fe4389d0dffe37c832973528a7aafd8842126a0d5a23d49aff4 +size 14768 diff --git a/checkpoint-304/scheduler.pt b/checkpoint-304/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..6b5db44d423bc56da0906bba71e4a2c62d1f77f9 --- /dev/null +++ b/checkpoint-304/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd1ee26d42c6bb20a44e8946dadecd2cff0ca8ab5815e472d6bc7cfc6f35c116 +size 1064 diff --git a/checkpoint-304/special_tokens_map.json b/checkpoint-304/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..278b7f0f84be865c4687700ee7b3c63d89a51e18 --- /dev/null +++ b/checkpoint-304/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-304/tokenizer.json b/checkpoint-304/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-304/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-304/tokenizer_config.json b/checkpoint-304/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ca91a2ef55f4239a7af81d7c9abb05f53621a07b --- /dev/null +++ b/checkpoint-304/tokenizer_config.json @@ -0,0 +1,2064 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|end_of_text|>", + "tokenizer_class": "PreTrainedTokenizer" +} diff --git a/checkpoint-304/trainer_state.json b/checkpoint-304/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2d7a09553afc501fce32205da9ad69954491b7fe --- /dev/null +++ b/checkpoint-304/trainer_state.json @@ -0,0 +1,2161 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.0, + "eval_steps": 500, + "global_step": 304, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.013157894736842105, + "grad_norm": 37.79440689086914, + "learning_rate": 5.0000000000000004e-08, + "loss": 3.1402, + "step": 1 + }, + { + "epoch": 0.02631578947368421, + "grad_norm": 38.45823287963867, + "learning_rate": 1.0000000000000001e-07, + "loss": 3.1787, + "step": 2 + }, + { + "epoch": 0.039473684210526314, + "grad_norm": 38.25625228881836, + "learning_rate": 1.5000000000000002e-07, + "loss": 3.1316, + "step": 3 + }, + { + "epoch": 0.05263157894736842, + "grad_norm": 37.2024040222168, + "learning_rate": 2.0000000000000002e-07, + "loss": 3.1011, + "step": 4 + }, + { + "epoch": 0.06578947368421052, + "grad_norm": 38.17294692993164, + "learning_rate": 2.5000000000000004e-07, + "loss": 3.133, + "step": 5 + }, + { + "epoch": 0.07894736842105263, + "grad_norm": 37.374794006347656, + "learning_rate": 3.0000000000000004e-07, + "loss": 3.0731, + "step": 6 + }, + { + "epoch": 0.09210526315789473, + "grad_norm": 37.226966857910156, + "learning_rate": 3.5000000000000004e-07, + "loss": 3.069, + "step": 7 + }, + { + "epoch": 0.10526315789473684, + "grad_norm": 38.40094757080078, + "learning_rate": 4.0000000000000003e-07, + "loss": 3.1223, + "step": 8 + }, + { + "epoch": 0.11842105263157894, + "grad_norm": 37.86320877075195, + "learning_rate": 4.5000000000000003e-07, + "loss": 3.062, + "step": 9 + }, + { + "epoch": 0.13157894736842105, + "grad_norm": 38.02171325683594, + "learning_rate": 5.000000000000001e-07, + "loss": 3.0008, + "step": 10 + }, + { + "epoch": 0.14473684210526316, + "grad_norm": 38.5522346496582, + "learning_rate": 5.5e-07, + "loss": 3.0047, + "step": 11 + }, + { + "epoch": 0.15789473684210525, + "grad_norm": 37.72829818725586, + "learning_rate": 6.000000000000001e-07, + "loss": 2.9274, + "step": 12 + }, + { + "epoch": 0.17105263157894737, + "grad_norm": 38.488494873046875, + "learning_rate": 6.5e-07, + "loss": 2.8727, + "step": 13 + }, + { + "epoch": 0.18421052631578946, + "grad_norm": 38.87471389770508, + "learning_rate": 7.000000000000001e-07, + "loss": 2.8422, + "step": 14 + }, + { + "epoch": 0.19736842105263158, + "grad_norm": 37.584896087646484, + "learning_rate": 7.5e-07, + "loss": 2.6728, + "step": 15 + }, + { + "epoch": 0.21052631578947367, + "grad_norm": 37.04607391357422, + "learning_rate": 8.000000000000001e-07, + "loss": 2.5215, + "step": 16 + }, + { + "epoch": 0.2236842105263158, + "grad_norm": 37.30121994018555, + "learning_rate": 8.500000000000001e-07, + "loss": 2.4689, + "step": 17 + }, + { + "epoch": 0.23684210526315788, + "grad_norm": 35.99961853027344, + "learning_rate": 9.000000000000001e-07, + "loss": 2.3, + "step": 18 + }, + { + "epoch": 0.25, + "grad_norm": 35.817543029785156, + "learning_rate": 9.500000000000001e-07, + "loss": 2.1423, + "step": 19 + }, + { + "epoch": 0.2631578947368421, + "grad_norm": 35.056915283203125, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.9639, + "step": 20 + }, + { + "epoch": 0.27631578947368424, + "grad_norm": 34.83850860595703, + "learning_rate": 1.0500000000000001e-06, + "loss": 1.7845, + "step": 21 + }, + { + "epoch": 0.2894736842105263, + "grad_norm": 34.32366943359375, + "learning_rate": 1.1e-06, + "loss": 1.5864, + "step": 22 + }, + { + "epoch": 0.3026315789473684, + "grad_norm": 33.79611587524414, + "learning_rate": 1.1500000000000002e-06, + "loss": 1.4011, + "step": 23 + }, + { + "epoch": 0.3157894736842105, + "grad_norm": 32.596031188964844, + "learning_rate": 1.2000000000000002e-06, + "loss": 1.195, + "step": 24 + }, + { + "epoch": 0.32894736842105265, + "grad_norm": 30.045007705688477, + "learning_rate": 1.25e-06, + "loss": 0.9883, + "step": 25 + }, + { + "epoch": 0.34210526315789475, + "grad_norm": 24.89093589782715, + "learning_rate": 1.3e-06, + "loss": 0.7669, + "step": 26 + }, + { + "epoch": 0.35526315789473684, + "grad_norm": 23.454408645629883, + "learning_rate": 1.3500000000000002e-06, + "loss": 0.6304, + "step": 27 + }, + { + "epoch": 0.3684210526315789, + "grad_norm": 19.837312698364258, + "learning_rate": 1.4000000000000001e-06, + "loss": 0.4717, + "step": 28 + }, + { + "epoch": 0.3815789473684211, + "grad_norm": 15.185093879699707, + "learning_rate": 1.45e-06, + "loss": 0.363, + "step": 29 + }, + { + "epoch": 0.39473684210526316, + "grad_norm": 9.057796478271484, + "learning_rate": 1.5e-06, + "loss": 0.2439, + "step": 30 + }, + { + "epoch": 0.40789473684210525, + "grad_norm": 5.976982593536377, + "learning_rate": 1.5500000000000002e-06, + "loss": 0.1864, + "step": 31 + }, + { + "epoch": 0.42105263157894735, + "grad_norm": 3.067375421524048, + "learning_rate": 1.6000000000000001e-06, + "loss": 0.1134, + "step": 32 + }, + { + "epoch": 0.4342105263157895, + "grad_norm": 2.3589119911193848, + "learning_rate": 1.6500000000000003e-06, + "loss": 0.0985, + "step": 33 + }, + { + "epoch": 0.4473684210526316, + "grad_norm": 2.0044353008270264, + "learning_rate": 1.7000000000000002e-06, + "loss": 0.0859, + "step": 34 + }, + { + "epoch": 0.4605263157894737, + "grad_norm": 1.4279972314834595, + "learning_rate": 1.75e-06, + "loss": 0.0728, + "step": 35 + }, + { + "epoch": 0.47368421052631576, + "grad_norm": 0.9807674288749695, + "learning_rate": 1.8000000000000001e-06, + "loss": 0.061, + "step": 36 + }, + { + "epoch": 0.4868421052631579, + "grad_norm": 0.906160295009613, + "learning_rate": 1.85e-06, + "loss": 0.0676, + "step": 37 + }, + { + "epoch": 0.5, + "grad_norm": 0.8837690353393555, + "learning_rate": 1.9000000000000002e-06, + "loss": 0.0622, + "step": 38 + }, + { + "epoch": 0.5131578947368421, + "grad_norm": 0.9579435586929321, + "learning_rate": 1.9500000000000004e-06, + "loss": 0.0557, + "step": 39 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 0.8149510622024536, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.0555, + "step": 40 + }, + { + "epoch": 0.5394736842105263, + "grad_norm": 0.8899760246276855, + "learning_rate": 2.05e-06, + "loss": 0.0517, + "step": 41 + }, + { + "epoch": 0.5526315789473685, + "grad_norm": 0.6007645130157471, + "learning_rate": 2.1000000000000002e-06, + "loss": 0.0518, + "step": 42 + }, + { + "epoch": 0.5657894736842105, + "grad_norm": 0.48819127678871155, + "learning_rate": 2.15e-06, + "loss": 0.0429, + "step": 43 + }, + { + "epoch": 0.5789473684210527, + "grad_norm": 0.42939358949661255, + "learning_rate": 2.2e-06, + "loss": 0.0459, + "step": 44 + }, + { + "epoch": 0.5921052631578947, + "grad_norm": 0.5706579685211182, + "learning_rate": 2.25e-06, + "loss": 0.0453, + "step": 45 + }, + { + "epoch": 0.6052631578947368, + "grad_norm": 0.3034597337245941, + "learning_rate": 2.3000000000000004e-06, + "loss": 0.0421, + "step": 46 + }, + { + "epoch": 0.618421052631579, + "grad_norm": 0.5601783394813538, + "learning_rate": 2.35e-06, + "loss": 0.0411, + "step": 47 + }, + { + "epoch": 0.631578947368421, + "grad_norm": 0.35388317704200745, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.04, + "step": 48 + }, + { + "epoch": 0.6447368421052632, + "grad_norm": 0.48609891533851624, + "learning_rate": 2.4500000000000003e-06, + "loss": 0.04, + "step": 49 + }, + { + "epoch": 0.6578947368421053, + "grad_norm": 0.4638507068157196, + "learning_rate": 2.5e-06, + "loss": 0.0369, + "step": 50 + }, + { + "epoch": 0.6710526315789473, + "grad_norm": 0.5685771703720093, + "learning_rate": 2.55e-06, + "loss": 0.0428, + "step": 51 + }, + { + "epoch": 0.6842105263157895, + "grad_norm": 0.46358174085617065, + "learning_rate": 2.6e-06, + "loss": 0.0483, + "step": 52 + }, + { + "epoch": 0.6973684210526315, + "grad_norm": 0.35054436326026917, + "learning_rate": 2.6500000000000005e-06, + "loss": 0.0391, + "step": 53 + }, + { + "epoch": 0.7105263157894737, + "grad_norm": 0.3350559175014496, + "learning_rate": 2.7000000000000004e-06, + "loss": 0.039, + "step": 54 + }, + { + "epoch": 0.7236842105263158, + "grad_norm": 0.2875112295150757, + "learning_rate": 2.7500000000000004e-06, + "loss": 0.0383, + "step": 55 + }, + { + "epoch": 0.7368421052631579, + "grad_norm": 0.4492928683757782, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.0358, + "step": 56 + }, + { + "epoch": 0.75, + "grad_norm": 0.29484888911247253, + "learning_rate": 2.85e-06, + "loss": 0.0355, + "step": 57 + }, + { + "epoch": 0.7631578947368421, + "grad_norm": 0.36551928520202637, + "learning_rate": 2.9e-06, + "loss": 0.0403, + "step": 58 + }, + { + "epoch": 0.7763157894736842, + "grad_norm": 0.4458053708076477, + "learning_rate": 2.95e-06, + "loss": 0.0342, + "step": 59 + }, + { + "epoch": 0.7894736842105263, + "grad_norm": 0.34047460556030273, + "learning_rate": 3e-06, + "loss": 0.0302, + "step": 60 + }, + { + "epoch": 0.8026315789473685, + "grad_norm": 0.3420606255531311, + "learning_rate": 3.05e-06, + "loss": 0.034, + "step": 61 + }, + { + "epoch": 0.8157894736842105, + "grad_norm": 0.3902851939201355, + "learning_rate": 3.1000000000000004e-06, + "loss": 0.0327, + "step": 62 + }, + { + "epoch": 0.8289473684210527, + "grad_norm": 0.29165828227996826, + "learning_rate": 3.1500000000000003e-06, + "loss": 0.0341, + "step": 63 + }, + { + "epoch": 0.8421052631578947, + "grad_norm": 0.40872958302497864, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.035, + "step": 64 + }, + { + "epoch": 0.8552631578947368, + "grad_norm": 0.36295783519744873, + "learning_rate": 3.2500000000000002e-06, + "loss": 0.0323, + "step": 65 + }, + { + "epoch": 0.868421052631579, + "grad_norm": 0.3857724368572235, + "learning_rate": 3.3000000000000006e-06, + "loss": 0.0336, + "step": 66 + }, + { + "epoch": 0.881578947368421, + "grad_norm": 0.3207017481327057, + "learning_rate": 3.3500000000000005e-06, + "loss": 0.0332, + "step": 67 + }, + { + "epoch": 0.8947368421052632, + "grad_norm": 0.2903987169265747, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.0327, + "step": 68 + }, + { + "epoch": 0.9078947368421053, + "grad_norm": 0.3386954963207245, + "learning_rate": 3.45e-06, + "loss": 0.0308, + "step": 69 + }, + { + "epoch": 0.9210526315789473, + "grad_norm": 0.4339621365070343, + "learning_rate": 3.5e-06, + "loss": 0.0361, + "step": 70 + }, + { + "epoch": 0.9342105263157895, + "grad_norm": 0.28095564246177673, + "learning_rate": 3.5500000000000003e-06, + "loss": 0.0306, + "step": 71 + }, + { + "epoch": 0.9473684210526315, + "grad_norm": 0.4141469895839691, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.028, + "step": 72 + }, + { + "epoch": 0.9605263157894737, + "grad_norm": 0.35212820768356323, + "learning_rate": 3.65e-06, + "loss": 0.032, + "step": 73 + }, + { + "epoch": 0.9736842105263158, + "grad_norm": 0.26956063508987427, + "learning_rate": 3.7e-06, + "loss": 0.0294, + "step": 74 + }, + { + "epoch": 0.9868421052631579, + "grad_norm": 0.32735681533813477, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.0272, + "step": 75 + }, + { + "epoch": 1.0, + "grad_norm": 0.4906782805919647, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.0324, + "step": 76 + }, + { + "epoch": 1.013157894736842, + "grad_norm": 0.3451901078224182, + "learning_rate": 3.85e-06, + "loss": 0.0288, + "step": 77 + }, + { + "epoch": 1.0263157894736843, + "grad_norm": 0.30598726868629456, + "learning_rate": 3.900000000000001e-06, + "loss": 0.0305, + "step": 78 + }, + { + "epoch": 1.0394736842105263, + "grad_norm": 0.31189921498298645, + "learning_rate": 3.95e-06, + "loss": 0.0274, + "step": 79 + }, + { + "epoch": 1.0526315789473684, + "grad_norm": 0.31895947456359863, + "learning_rate": 4.000000000000001e-06, + "loss": 0.0236, + "step": 80 + }, + { + "epoch": 1.0657894736842106, + "grad_norm": 0.3290308117866516, + "learning_rate": 4.05e-06, + "loss": 0.0284, + "step": 81 + }, + { + "epoch": 1.0789473684210527, + "grad_norm": 0.3651576638221741, + "learning_rate": 4.1e-06, + "loss": 0.0274, + "step": 82 + }, + { + "epoch": 1.0921052631578947, + "grad_norm": 0.2393084615468979, + "learning_rate": 4.15e-06, + "loss": 0.0301, + "step": 83 + }, + { + "epoch": 1.1052631578947367, + "grad_norm": 0.333898663520813, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.0235, + "step": 84 + }, + { + "epoch": 1.118421052631579, + "grad_norm": 0.3287582993507385, + "learning_rate": 4.25e-06, + "loss": 0.0248, + "step": 85 + }, + { + "epoch": 1.131578947368421, + "grad_norm": 0.3432455360889435, + "learning_rate": 4.3e-06, + "loss": 0.026, + "step": 86 + }, + { + "epoch": 1.1447368421052633, + "grad_norm": 0.3176783621311188, + "learning_rate": 4.350000000000001e-06, + "loss": 0.0249, + "step": 87 + }, + { + "epoch": 1.1578947368421053, + "grad_norm": 0.33373433351516724, + "learning_rate": 4.4e-06, + "loss": 0.0251, + "step": 88 + }, + { + "epoch": 1.1710526315789473, + "grad_norm": 0.36087968945503235, + "learning_rate": 4.450000000000001e-06, + "loss": 0.0251, + "step": 89 + }, + { + "epoch": 1.1842105263157894, + "grad_norm": 0.3681696057319641, + "learning_rate": 4.5e-06, + "loss": 0.0276, + "step": 90 + }, + { + "epoch": 1.1973684210526316, + "grad_norm": 0.46539774537086487, + "learning_rate": 4.5500000000000005e-06, + "loss": 0.0229, + "step": 91 + }, + { + "epoch": 1.2105263157894737, + "grad_norm": 0.23368288576602936, + "learning_rate": 4.600000000000001e-06, + "loss": 0.021, + "step": 92 + }, + { + "epoch": 1.2236842105263157, + "grad_norm": 0.26623716950416565, + "learning_rate": 4.65e-06, + "loss": 0.0265, + "step": 93 + }, + { + "epoch": 1.236842105263158, + "grad_norm": 0.28750717639923096, + "learning_rate": 4.7e-06, + "loss": 0.0221, + "step": 94 + }, + { + "epoch": 1.25, + "grad_norm": 0.46578383445739746, + "learning_rate": 4.75e-06, + "loss": 0.0236, + "step": 95 + }, + { + "epoch": 1.263157894736842, + "grad_norm": 0.33406543731689453, + "learning_rate": 4.800000000000001e-06, + "loss": 0.0239, + "step": 96 + }, + { + "epoch": 1.2763157894736843, + "grad_norm": 0.21247217059135437, + "learning_rate": 4.85e-06, + "loss": 0.0188, + "step": 97 + }, + { + "epoch": 1.2894736842105263, + "grad_norm": 0.26229164004325867, + "learning_rate": 4.9000000000000005e-06, + "loss": 0.022, + "step": 98 + }, + { + "epoch": 1.3026315789473684, + "grad_norm": 0.2967258393764496, + "learning_rate": 4.95e-06, + "loss": 0.0218, + "step": 99 + }, + { + "epoch": 1.3157894736842106, + "grad_norm": 0.419189453125, + "learning_rate": 5e-06, + "loss": 0.0247, + "step": 100 + }, + { + "epoch": 1.3289473684210527, + "grad_norm": 0.25418952107429504, + "learning_rate": 4.999902656502973e-06, + "loss": 0.0223, + "step": 101 + }, + { + "epoch": 1.3421052631578947, + "grad_norm": 0.20174147188663483, + "learning_rate": 4.9996106335924965e-06, + "loss": 0.0266, + "step": 102 + }, + { + "epoch": 1.3552631578947367, + "grad_norm": 0.21732494235038757, + "learning_rate": 4.999123954009797e-06, + "loss": 0.0188, + "step": 103 + }, + { + "epoch": 1.368421052631579, + "grad_norm": 0.2683119773864746, + "learning_rate": 4.998442655654946e-06, + "loss": 0.0203, + "step": 104 + }, + { + "epoch": 1.381578947368421, + "grad_norm": 0.18175765872001648, + "learning_rate": 4.997566791583916e-06, + "loss": 0.0185, + "step": 105 + }, + { + "epoch": 1.3947368421052633, + "grad_norm": 0.3932501971721649, + "learning_rate": 4.996496430004446e-06, + "loss": 0.0238, + "step": 106 + }, + { + "epoch": 1.4078947368421053, + "grad_norm": 0.31145599484443665, + "learning_rate": 4.995231654270726e-06, + "loss": 0.0199, + "step": 107 + }, + { + "epoch": 1.4210526315789473, + "grad_norm": 0.41356661915779114, + "learning_rate": 4.993772562876909e-06, + "loss": 0.0187, + "step": 108 + }, + { + "epoch": 1.4342105263157894, + "grad_norm": 0.22484919428825378, + "learning_rate": 4.992119269449445e-06, + "loss": 0.0182, + "step": 109 + }, + { + "epoch": 1.4473684210526316, + "grad_norm": 0.28703081607818604, + "learning_rate": 4.990271902738223e-06, + "loss": 0.0239, + "step": 110 + }, + { + "epoch": 1.4605263157894737, + "grad_norm": 0.2394670695066452, + "learning_rate": 4.988230606606552e-06, + "loss": 0.0171, + "step": 111 + }, + { + "epoch": 1.4736842105263157, + "grad_norm": 0.3552885949611664, + "learning_rate": 4.985995540019956e-06, + "loss": 0.0226, + "step": 112 + }, + { + "epoch": 1.486842105263158, + "grad_norm": 0.24968908727169037, + "learning_rate": 4.983566877033791e-06, + "loss": 0.0193, + "step": 113 + }, + { + "epoch": 1.5, + "grad_norm": 0.24420695006847382, + "learning_rate": 4.980944806779698e-06, + "loss": 0.0226, + "step": 114 + }, + { + "epoch": 1.513157894736842, + "grad_norm": 0.34696799516677856, + "learning_rate": 4.9781295334508664e-06, + "loss": 0.02, + "step": 115 + }, + { + "epoch": 1.526315789473684, + "grad_norm": 0.23682132363319397, + "learning_rate": 4.975121276286136e-06, + "loss": 0.0194, + "step": 116 + }, + { + "epoch": 1.5394736842105263, + "grad_norm": 0.2485751509666443, + "learning_rate": 4.9719202695529265e-06, + "loss": 0.0149, + "step": 117 + }, + { + "epoch": 1.5526315789473686, + "grad_norm": 0.2815033495426178, + "learning_rate": 4.968526762528988e-06, + "loss": 0.0153, + "step": 118 + }, + { + "epoch": 1.5657894736842106, + "grad_norm": 0.24127744138240814, + "learning_rate": 4.964941019482995e-06, + "loss": 0.019, + "step": 119 + }, + { + "epoch": 1.5789473684210527, + "grad_norm": 0.2987695038318634, + "learning_rate": 4.961163319653959e-06, + "loss": 0.0165, + "step": 120 + }, + { + "epoch": 1.5921052631578947, + "grad_norm": 0.33492133021354675, + "learning_rate": 4.9571939572294914e-06, + "loss": 0.0185, + "step": 121 + }, + { + "epoch": 1.6052631578947367, + "grad_norm": 0.20466521382331848, + "learning_rate": 4.953033241322887e-06, + "loss": 0.0151, + "step": 122 + }, + { + "epoch": 1.618421052631579, + "grad_norm": 0.36396247148513794, + "learning_rate": 4.948681495949055e-06, + "loss": 0.0138, + "step": 123 + }, + { + "epoch": 1.631578947368421, + "grad_norm": 0.2000381350517273, + "learning_rate": 4.944139059999286e-06, + "loss": 0.0125, + "step": 124 + }, + { + "epoch": 1.6447368421052633, + "grad_norm": 0.24977952241897583, + "learning_rate": 4.939406287214861e-06, + "loss": 0.0152, + "step": 125 + }, + { + "epoch": 1.6578947368421053, + "grad_norm": 0.26705336570739746, + "learning_rate": 4.9344835461595016e-06, + "loss": 0.0148, + "step": 126 + }, + { + "epoch": 1.6710526315789473, + "grad_norm": 0.26699599623680115, + "learning_rate": 4.929371220190671e-06, + "loss": 0.0151, + "step": 127 + }, + { + "epoch": 1.6842105263157894, + "grad_norm": 0.20149633288383484, + "learning_rate": 4.9240697074297205e-06, + "loss": 0.0151, + "step": 128 + }, + { + "epoch": 1.6973684210526314, + "grad_norm": 0.1961003988981247, + "learning_rate": 4.918579420730884e-06, + "loss": 0.0163, + "step": 129 + }, + { + "epoch": 1.7105263157894737, + "grad_norm": 0.2148503214120865, + "learning_rate": 4.912900787649124e-06, + "loss": 0.0137, + "step": 130 + }, + { + "epoch": 1.723684210526316, + "grad_norm": 0.20505128800868988, + "learning_rate": 4.907034250406846e-06, + "loss": 0.0136, + "step": 131 + }, + { + "epoch": 1.736842105263158, + "grad_norm": 0.19462467730045319, + "learning_rate": 4.900980265859449e-06, + "loss": 0.0139, + "step": 132 + }, + { + "epoch": 1.75, + "grad_norm": 0.21602794528007507, + "learning_rate": 4.894739305459754e-06, + "loss": 0.015, + "step": 133 + }, + { + "epoch": 1.763157894736842, + "grad_norm": 0.22933153808116913, + "learning_rate": 4.88831185522129e-06, + "loss": 0.0142, + "step": 134 + }, + { + "epoch": 1.776315789473684, + "grad_norm": 0.1785646229982376, + "learning_rate": 4.881698415680442e-06, + "loss": 0.0097, + "step": 135 + }, + { + "epoch": 1.7894736842105263, + "grad_norm": 0.21535581350326538, + "learning_rate": 4.874899501857477e-06, + "loss": 0.0106, + "step": 136 + }, + { + "epoch": 1.8026315789473686, + "grad_norm": 0.2360723614692688, + "learning_rate": 4.867915643216434e-06, + "loss": 0.0123, + "step": 137 + }, + { + "epoch": 1.8157894736842106, + "grad_norm": 0.18098825216293335, + "learning_rate": 4.860747383623889e-06, + "loss": 0.0126, + "step": 138 + }, + { + "epoch": 1.8289473684210527, + "grad_norm": 0.1836131066083908, + "learning_rate": 4.85339528130661e-06, + "loss": 0.0125, + "step": 139 + }, + { + "epoch": 1.8421052631578947, + "grad_norm": 0.34765973687171936, + "learning_rate": 4.845859908808074e-06, + "loss": 0.0158, + "step": 140 + }, + { + "epoch": 1.8552631578947367, + "grad_norm": 0.22595159709453583, + "learning_rate": 4.838141852943891e-06, + "loss": 0.0101, + "step": 141 + }, + { + "epoch": 1.868421052631579, + "grad_norm": 0.2811257243156433, + "learning_rate": 4.830241714756099e-06, + "loss": 0.0111, + "step": 142 + }, + { + "epoch": 1.881578947368421, + "grad_norm": 0.1875840127468109, + "learning_rate": 4.822160109466361e-06, + "loss": 0.0086, + "step": 143 + }, + { + "epoch": 1.8947368421052633, + "grad_norm": 0.19390800595283508, + "learning_rate": 4.813897666428054e-06, + "loss": 0.0106, + "step": 144 + }, + { + "epoch": 1.9078947368421053, + "grad_norm": 0.3725268244743347, + "learning_rate": 4.805455029077255e-06, + "loss": 0.0095, + "step": 145 + }, + { + "epoch": 1.9210526315789473, + "grad_norm": 0.2201736867427826, + "learning_rate": 4.79683285488264e-06, + "loss": 0.0074, + "step": 146 + }, + { + "epoch": 1.9342105263157894, + "grad_norm": 0.17423805594444275, + "learning_rate": 4.788031815294282e-06, + "loss": 0.0072, + "step": 147 + }, + { + "epoch": 1.9473684210526314, + "grad_norm": 0.22169643640518188, + "learning_rate": 4.779052595691355e-06, + "loss": 0.0121, + "step": 148 + }, + { + "epoch": 1.9605263157894737, + "grad_norm": 0.3247295618057251, + "learning_rate": 4.76989589532877e-06, + "loss": 0.0121, + "step": 149 + }, + { + "epoch": 1.973684210526316, + "grad_norm": 0.1830369532108307, + "learning_rate": 4.7605624272827125e-06, + "loss": 0.0077, + "step": 150 + }, + { + "epoch": 1.986842105263158, + "grad_norm": 0.2967239022254944, + "learning_rate": 4.75105291839512e-06, + "loss": 0.0104, + "step": 151 + }, + { + "epoch": 2.0, + "grad_norm": 0.17589347064495087, + "learning_rate": 4.741368109217072e-06, + "loss": 0.0075, + "step": 152 + }, + { + "epoch": 2.013157894736842, + "grad_norm": 0.15554101765155792, + "learning_rate": 4.7315087539511225e-06, + "loss": 0.0063, + "step": 153 + }, + { + "epoch": 2.026315789473684, + "grad_norm": 0.13191422820091248, + "learning_rate": 4.721475620392567e-06, + "loss": 0.0039, + "step": 154 + }, + { + "epoch": 2.039473684210526, + "grad_norm": 0.1909502148628235, + "learning_rate": 4.711269489869654e-06, + "loss": 0.0055, + "step": 155 + }, + { + "epoch": 2.0526315789473686, + "grad_norm": 0.16942323744297028, + "learning_rate": 4.700891157182729e-06, + "loss": 0.0055, + "step": 156 + }, + { + "epoch": 2.0657894736842106, + "grad_norm": 0.1740521341562271, + "learning_rate": 4.690341430542351e-06, + "loss": 0.006, + "step": 157 + }, + { + "epoch": 2.0789473684210527, + "grad_norm": 0.19565710425376892, + "learning_rate": 4.679621131506347e-06, + "loss": 0.0057, + "step": 158 + }, + { + "epoch": 2.0921052631578947, + "grad_norm": 0.13488221168518066, + "learning_rate": 4.668731094915835e-06, + "loss": 0.0033, + "step": 159 + }, + { + "epoch": 2.1052631578947367, + "grad_norm": 0.16322746872901917, + "learning_rate": 4.657672168830211e-06, + "loss": 0.0042, + "step": 160 + }, + { + "epoch": 2.1184210526315788, + "grad_norm": 0.2087877243757248, + "learning_rate": 4.646445214461105e-06, + "loss": 0.0052, + "step": 161 + }, + { + "epoch": 2.1315789473684212, + "grad_norm": 0.12736408412456512, + "learning_rate": 4.635051106105316e-06, + "loss": 0.0051, + "step": 162 + }, + { + "epoch": 2.1447368421052633, + "grad_norm": 0.13264045119285583, + "learning_rate": 4.623490731076728e-06, + "loss": 0.0035, + "step": 163 + }, + { + "epoch": 2.1578947368421053, + "grad_norm": 0.2015363723039627, + "learning_rate": 4.6117649896372055e-06, + "loss": 0.0055, + "step": 164 + }, + { + "epoch": 2.1710526315789473, + "grad_norm": 0.21640510857105255, + "learning_rate": 4.59987479492649e-06, + "loss": 0.008, + "step": 165 + }, + { + "epoch": 2.1842105263157894, + "grad_norm": 0.17276327311992645, + "learning_rate": 4.587821072891089e-06, + "loss": 0.0058, + "step": 166 + }, + { + "epoch": 2.1973684210526314, + "grad_norm": 0.15923018753528595, + "learning_rate": 4.5756047622121665e-06, + "loss": 0.0039, + "step": 167 + }, + { + "epoch": 2.2105263157894735, + "grad_norm": 0.14791631698608398, + "learning_rate": 4.563226814232444e-06, + "loss": 0.0032, + "step": 168 + }, + { + "epoch": 2.223684210526316, + "grad_norm": 0.16776816546916962, + "learning_rate": 4.550688192882115e-06, + "loss": 0.0043, + "step": 169 + }, + { + "epoch": 2.236842105263158, + "grad_norm": 0.12374848127365112, + "learning_rate": 4.53798987460378e-06, + "loss": 0.0035, + "step": 170 + }, + { + "epoch": 2.25, + "grad_norm": 0.13051433861255646, + "learning_rate": 4.525132848276405e-06, + "loss": 0.0036, + "step": 171 + }, + { + "epoch": 2.263157894736842, + "grad_norm": 0.12607790529727936, + "learning_rate": 4.512118115138315e-06, + "loss": 0.0052, + "step": 172 + }, + { + "epoch": 2.276315789473684, + "grad_norm": 0.09630817174911499, + "learning_rate": 4.498946688709216e-06, + "loss": 0.0031, + "step": 173 + }, + { + "epoch": 2.2894736842105265, + "grad_norm": 0.11332327872514725, + "learning_rate": 4.485619594711278e-06, + "loss": 0.0043, + "step": 174 + }, + { + "epoch": 2.3026315789473686, + "grad_norm": 0.16632875800132751, + "learning_rate": 4.4721378709892475e-06, + "loss": 0.005, + "step": 175 + }, + { + "epoch": 2.3157894736842106, + "grad_norm": 0.12856662273406982, + "learning_rate": 4.4585025674296315e-06, + "loss": 0.0031, + "step": 176 + }, + { + "epoch": 2.3289473684210527, + "grad_norm": 0.197174072265625, + "learning_rate": 4.444714745878936e-06, + "loss": 0.0045, + "step": 177 + }, + { + "epoch": 2.3421052631578947, + "grad_norm": 0.17151176929473877, + "learning_rate": 4.430775480060973e-06, + "loss": 0.0044, + "step": 178 + }, + { + "epoch": 2.3552631578947367, + "grad_norm": 0.14734052121639252, + "learning_rate": 4.416685855493246e-06, + "loss": 0.0053, + "step": 179 + }, + { + "epoch": 2.3684210526315788, + "grad_norm": 0.13286560773849487, + "learning_rate": 4.4024469694024194e-06, + "loss": 0.0039, + "step": 180 + }, + { + "epoch": 2.3815789473684212, + "grad_norm": 0.1636727899312973, + "learning_rate": 4.388059930638865e-06, + "loss": 0.0039, + "step": 181 + }, + { + "epoch": 2.3947368421052633, + "grad_norm": 0.1082785576581955, + "learning_rate": 4.373525859590313e-06, + "loss": 0.0025, + "step": 182 + }, + { + "epoch": 2.4078947368421053, + "grad_norm": 0.1716354638338089, + "learning_rate": 4.358845888094607e-06, + "loss": 0.004, + "step": 183 + }, + { + "epoch": 2.4210526315789473, + "grad_norm": 0.14045757055282593, + "learning_rate": 4.3440211593515556e-06, + "loss": 0.0026, + "step": 184 + }, + { + "epoch": 2.4342105263157894, + "grad_norm": 0.1682705134153366, + "learning_rate": 4.32905282783391e-06, + "loss": 0.0042, + "step": 185 + }, + { + "epoch": 2.4473684210526314, + "grad_norm": 0.11872018873691559, + "learning_rate": 4.313942059197457e-06, + "loss": 0.0028, + "step": 186 + }, + { + "epoch": 2.4605263157894735, + "grad_norm": 0.12182936072349548, + "learning_rate": 4.298690030190247e-06, + "loss": 0.0018, + "step": 187 + }, + { + "epoch": 2.473684210526316, + "grad_norm": 0.2031281590461731, + "learning_rate": 4.283297928560951e-06, + "loss": 0.0032, + "step": 188 + }, + { + "epoch": 2.486842105263158, + "grad_norm": 0.0959291160106659, + "learning_rate": 4.267766952966369e-06, + "loss": 0.0015, + "step": 189 + }, + { + "epoch": 2.5, + "grad_norm": 0.15291978418827057, + "learning_rate": 4.252098312878083e-06, + "loss": 0.0036, + "step": 190 + }, + { + "epoch": 2.513157894736842, + "grad_norm": 0.15930163860321045, + "learning_rate": 4.236293228488267e-06, + "loss": 0.0047, + "step": 191 + }, + { + "epoch": 2.526315789473684, + "grad_norm": 0.2150997817516327, + "learning_rate": 4.220352930614672e-06, + "loss": 0.0038, + "step": 192 + }, + { + "epoch": 2.5394736842105265, + "grad_norm": 0.1317511945962906, + "learning_rate": 4.204278660604767e-06, + "loss": 0.0032, + "step": 193 + }, + { + "epoch": 2.5526315789473686, + "grad_norm": 0.07808093726634979, + "learning_rate": 4.1880716702390764e-06, + "loss": 0.0011, + "step": 194 + }, + { + "epoch": 2.5657894736842106, + "grad_norm": 0.13284094631671906, + "learning_rate": 4.171733221633695e-06, + "loss": 0.0037, + "step": 195 + }, + { + "epoch": 2.5789473684210527, + "grad_norm": 0.16264718770980835, + "learning_rate": 4.155264587142002e-06, + "loss": 0.0039, + "step": 196 + }, + { + "epoch": 2.5921052631578947, + "grad_norm": 0.10431212931871414, + "learning_rate": 4.138667049255574e-06, + "loss": 0.0023, + "step": 197 + }, + { + "epoch": 2.6052631578947367, + "grad_norm": 0.08813079446554184, + "learning_rate": 4.121941900504316e-06, + "loss": 0.0018, + "step": 198 + }, + { + "epoch": 2.6184210526315788, + "grad_norm": 0.22164294123649597, + "learning_rate": 4.105090443355801e-06, + "loss": 0.0037, + "step": 199 + }, + { + "epoch": 2.6315789473684212, + "grad_norm": 0.09111231565475464, + "learning_rate": 4.088113990113846e-06, + "loss": 0.0019, + "step": 200 + }, + { + "epoch": 2.6447368421052633, + "grad_norm": 0.0871724933385849, + "learning_rate": 4.071013862816311e-06, + "loss": 0.0014, + "step": 201 + }, + { + "epoch": 2.6578947368421053, + "grad_norm": 0.2138734757900238, + "learning_rate": 4.0537913931321495e-06, + "loss": 0.0022, + "step": 202 + }, + { + "epoch": 2.6710526315789473, + "grad_norm": 0.11238733679056168, + "learning_rate": 4.036447922257699e-06, + "loss": 0.0023, + "step": 203 + }, + { + "epoch": 2.6842105263157894, + "grad_norm": 0.0815015360713005, + "learning_rate": 4.018984800812248e-06, + "loss": 0.0011, + "step": 204 + }, + { + "epoch": 2.6973684210526314, + "grad_norm": 0.304352343082428, + "learning_rate": 4.001403388732842e-06, + "loss": 0.003, + "step": 205 + }, + { + "epoch": 2.7105263157894735, + "grad_norm": 0.10469458252191544, + "learning_rate": 3.983705055168391e-06, + "loss": 0.0009, + "step": 206 + }, + { + "epoch": 2.723684210526316, + "grad_norm": 0.1440751701593399, + "learning_rate": 3.965891178373038e-06, + "loss": 0.0025, + "step": 207 + }, + { + "epoch": 2.736842105263158, + "grad_norm": 0.2173687070608139, + "learning_rate": 3.947963145598833e-06, + "loss": 0.0028, + "step": 208 + }, + { + "epoch": 2.75, + "grad_norm": 0.2922506332397461, + "learning_rate": 3.929922352987702e-06, + "loss": 0.003, + "step": 209 + }, + { + "epoch": 2.763157894736842, + "grad_norm": 0.18853916227817535, + "learning_rate": 3.911770205462717e-06, + "loss": 0.0014, + "step": 210 + }, + { + "epoch": 2.776315789473684, + "grad_norm": 0.12060266733169556, + "learning_rate": 3.8935081166186935e-06, + "loss": 0.0015, + "step": 211 + }, + { + "epoch": 2.7894736842105265, + "grad_norm": 0.14512351155281067, + "learning_rate": 3.875137508612104e-06, + "loss": 0.002, + "step": 212 + }, + { + "epoch": 2.8026315789473686, + "grad_norm": 0.15343990921974182, + "learning_rate": 3.856659812050328e-06, + "loss": 0.0012, + "step": 213 + }, + { + "epoch": 2.8157894736842106, + "grad_norm": 0.09639029949903488, + "learning_rate": 3.838076465880248e-06, + "loss": 0.0017, + "step": 214 + }, + { + "epoch": 2.8289473684210527, + "grad_norm": 0.09907295554876328, + "learning_rate": 3.819388917276186e-06, + "loss": 0.0012, + "step": 215 + }, + { + "epoch": 2.8421052631578947, + "grad_norm": 0.05898207053542137, + "learning_rate": 3.8005986215272056e-06, + "loss": 0.0006, + "step": 216 + }, + { + "epoch": 2.8552631578947367, + "grad_norm": 0.10509718954563141, + "learning_rate": 3.7817070419237866e-06, + "loss": 0.0013, + "step": 217 + }, + { + "epoch": 2.8684210526315788, + "grad_norm": 0.17495931684970856, + "learning_rate": 3.7627156496438686e-06, + "loss": 0.001, + "step": 218 + }, + { + "epoch": 2.8815789473684212, + "grad_norm": 0.12321923673152924, + "learning_rate": 3.7436259236382797e-06, + "loss": 0.001, + "step": 219 + }, + { + "epoch": 2.8947368421052633, + "grad_norm": 0.11147578060626984, + "learning_rate": 3.7244393505155713e-06, + "loss": 0.0015, + "step": 220 + }, + { + "epoch": 2.9078947368421053, + "grad_norm": 0.06215621903538704, + "learning_rate": 3.7051574244262412e-06, + "loss": 0.0006, + "step": 221 + }, + { + "epoch": 2.9210526315789473, + "grad_norm": 0.03004705347120762, + "learning_rate": 3.6857816469463806e-06, + "loss": 0.0002, + "step": 222 + }, + { + "epoch": 2.9342105263157894, + "grad_norm": 0.09312062710523605, + "learning_rate": 3.6663135269607413e-06, + "loss": 0.0014, + "step": 223 + }, + { + "epoch": 2.9473684210526314, + "grad_norm": 0.05324690416455269, + "learning_rate": 3.6467545805452266e-06, + "loss": 0.0005, + "step": 224 + }, + { + "epoch": 2.9605263157894735, + "grad_norm": 0.06438126415014267, + "learning_rate": 3.6271063308488298e-06, + "loss": 0.0005, + "step": 225 + }, + { + "epoch": 2.973684210526316, + "grad_norm": 0.08646634221076965, + "learning_rate": 3.6073703079750204e-06, + "loss": 0.0006, + "step": 226 + }, + { + "epoch": 2.986842105263158, + "grad_norm": 0.05682829022407532, + "learning_rate": 3.5875480488625847e-06, + "loss": 0.0006, + "step": 227 + }, + { + "epoch": 3.0, + "grad_norm": 0.11831886321306229, + "learning_rate": 3.5676410971659404e-06, + "loss": 0.0006, + "step": 228 + }, + { + "epoch": 3.013157894736842, + "grad_norm": 0.0533737950026989, + "learning_rate": 3.547651003134921e-06, + "loss": 0.0003, + "step": 229 + }, + { + "epoch": 3.026315789473684, + "grad_norm": 0.06704334169626236, + "learning_rate": 3.527579323494055e-06, + "loss": 0.0005, + "step": 230 + }, + { + "epoch": 3.039473684210526, + "grad_norm": 0.024390004575252533, + "learning_rate": 3.507427621321331e-06, + "loss": 0.0002, + "step": 231 + }, + { + "epoch": 3.0526315789473686, + "grad_norm": 0.10754281282424927, + "learning_rate": 3.4871974659264786e-06, + "loss": 0.0009, + "step": 232 + }, + { + "epoch": 3.0657894736842106, + "grad_norm": 0.032474737614393234, + "learning_rate": 3.466890432728754e-06, + "loss": 0.0002, + "step": 233 + }, + { + "epoch": 3.0789473684210527, + "grad_norm": 0.11489477753639221, + "learning_rate": 3.446508103134259e-06, + "loss": 0.0008, + "step": 234 + }, + { + "epoch": 3.0921052631578947, + "grad_norm": 0.11805123090744019, + "learning_rate": 3.426052064412785e-06, + "loss": 0.0013, + "step": 235 + }, + { + "epoch": 3.1052631578947367, + "grad_norm": 0.04284543916583061, + "learning_rate": 3.4055239095742067e-06, + "loss": 0.0004, + "step": 236 + }, + { + "epoch": 3.1184210526315788, + "grad_norm": 0.0592227578163147, + "learning_rate": 3.3849252372444295e-06, + "loss": 0.0005, + "step": 237 + }, + { + "epoch": 3.1315789473684212, + "grad_norm": 0.08888686448335648, + "learning_rate": 3.364257651540891e-06, + "loss": 0.0007, + "step": 238 + }, + { + "epoch": 3.1447368421052633, + "grad_norm": 0.04613477736711502, + "learning_rate": 3.343522761947646e-06, + "loss": 0.0004, + "step": 239 + }, + { + "epoch": 3.1578947368421053, + "grad_norm": 0.024109596386551857, + "learning_rate": 3.322722183190025e-06, + "loss": 0.0002, + "step": 240 + }, + { + "epoch": 3.1710526315789473, + "grad_norm": 0.04759601503610611, + "learning_rate": 3.3018575351088894e-06, + "loss": 0.0002, + "step": 241 + }, + { + "epoch": 3.1842105263157894, + "grad_norm": 0.06583128869533539, + "learning_rate": 3.280930442534486e-06, + "loss": 0.0003, + "step": 242 + }, + { + "epoch": 3.1973684210526314, + "grad_norm": 0.03406457230448723, + "learning_rate": 3.2599425351599136e-06, + "loss": 0.0001, + "step": 243 + }, + { + "epoch": 3.2105263157894735, + "grad_norm": 0.020209595561027527, + "learning_rate": 3.238895447414211e-06, + "loss": 0.0002, + "step": 244 + }, + { + "epoch": 3.223684210526316, + "grad_norm": 0.04148108884692192, + "learning_rate": 3.217790818335077e-06, + "loss": 0.0002, + "step": 245 + }, + { + "epoch": 3.236842105263158, + "grad_norm": 0.05275535210967064, + "learning_rate": 3.196630291441231e-06, + "loss": 0.0003, + "step": 246 + }, + { + "epoch": 3.25, + "grad_norm": 0.03692334517836571, + "learning_rate": 3.175415514604422e-06, + "loss": 0.0004, + "step": 247 + }, + { + "epoch": 3.263157894736842, + "grad_norm": 0.06586624681949615, + "learning_rate": 3.154148139921102e-06, + "loss": 0.0004, + "step": 248 + }, + { + "epoch": 3.276315789473684, + "grad_norm": 0.06966730207204819, + "learning_rate": 3.132829823583771e-06, + "loss": 0.0003, + "step": 249 + }, + { + "epoch": 3.2894736842105265, + "grad_norm": 0.10422863811254501, + "learning_rate": 3.1114622257520004e-06, + "loss": 0.0004, + "step": 250 + }, + { + "epoch": 3.3026315789473686, + "grad_norm": 0.10132399946451187, + "learning_rate": 3.0900470104231456e-06, + "loss": 0.0008, + "step": 251 + }, + { + "epoch": 3.3157894736842106, + "grad_norm": 0.06418923288583755, + "learning_rate": 3.0685858453027668e-06, + "loss": 0.0004, + "step": 252 + }, + { + "epoch": 3.3289473684210527, + "grad_norm": 0.1488313227891922, + "learning_rate": 3.047080401674754e-06, + "loss": 0.0021, + "step": 253 + }, + { + "epoch": 3.3421052631578947, + "grad_norm": 0.05905044823884964, + "learning_rate": 3.0255323542711784e-06, + "loss": 0.0006, + "step": 254 + }, + { + "epoch": 3.3552631578947367, + "grad_norm": 0.05011991411447525, + "learning_rate": 3.00394338114187e-06, + "loss": 0.0004, + "step": 255 + }, + { + "epoch": 3.3684210526315788, + "grad_norm": 0.026315532624721527, + "learning_rate": 2.9823151635237424e-06, + "loss": 0.0002, + "step": 256 + }, + { + "epoch": 3.3815789473684212, + "grad_norm": 0.031153831630945206, + "learning_rate": 2.9606493857098657e-06, + "loss": 0.0003, + "step": 257 + }, + { + "epoch": 3.3947368421052633, + "grad_norm": 0.019811732694506645, + "learning_rate": 2.938947734918302e-06, + "loss": 0.0001, + "step": 258 + }, + { + "epoch": 3.4078947368421053, + "grad_norm": 0.029009979218244553, + "learning_rate": 2.9172119011607153e-06, + "loss": 0.0002, + "step": 259 + }, + { + "epoch": 3.4210526315789473, + "grad_norm": 0.08594539761543274, + "learning_rate": 2.8954435771107604e-06, + "loss": 0.0003, + "step": 260 + }, + { + "epoch": 3.4342105263157894, + "grad_norm": 0.07609947770833969, + "learning_rate": 2.8736444579722665e-06, + "loss": 0.0006, + "step": 261 + }, + { + "epoch": 3.4473684210526314, + "grad_norm": 0.052105486392974854, + "learning_rate": 2.8518162413472266e-06, + "loss": 0.0003, + "step": 262 + }, + { + "epoch": 3.4605263157894735, + "grad_norm": 0.023044012486934662, + "learning_rate": 2.8299606271035913e-06, + "loss": 0.0001, + "step": 263 + }, + { + "epoch": 3.473684210526316, + "grad_norm": 0.01714818924665451, + "learning_rate": 2.8080793172428965e-06, + "loss": 0.0001, + "step": 264 + }, + { + "epoch": 3.486842105263158, + "grad_norm": 0.024353889748454094, + "learning_rate": 2.786174015767721e-06, + "loss": 0.0002, + "step": 265 + }, + { + "epoch": 3.5, + "grad_norm": 0.044456806033849716, + "learning_rate": 2.764246428548983e-06, + "loss": 0.0004, + "step": 266 + }, + { + "epoch": 3.513157894736842, + "grad_norm": 0.06826099753379822, + "learning_rate": 2.742298263193099e-06, + "loss": 0.0005, + "step": 267 + }, + { + "epoch": 3.526315789473684, + "grad_norm": 0.2765248417854309, + "learning_rate": 2.720331228909005e-06, + "loss": 0.0005, + "step": 268 + }, + { + "epoch": 3.5394736842105265, + "grad_norm": 0.04589018225669861, + "learning_rate": 2.6983470363750497e-06, + "loss": 0.0004, + "step": 269 + }, + { + "epoch": 3.5526315789473686, + "grad_norm": 0.023166710510849953, + "learning_rate": 2.6763473976057776e-06, + "loss": 0.0001, + "step": 270 + }, + { + "epoch": 3.5657894736842106, + "grad_norm": 0.03657109662890434, + "learning_rate": 2.6543340258186063e-06, + "loss": 0.0002, + "step": 271 + }, + { + "epoch": 3.5789473684210527, + "grad_norm": 0.08881364017724991, + "learning_rate": 2.6323086353004077e-06, + "loss": 0.0004, + "step": 272 + }, + { + "epoch": 3.5921052631578947, + "grad_norm": 0.022605225443840027, + "learning_rate": 2.610272941274012e-06, + "loss": 0.0001, + "step": 273 + }, + { + "epoch": 3.6052631578947367, + "grad_norm": 0.05161530151963234, + "learning_rate": 2.588228659764632e-06, + "loss": 0.0002, + "step": 274 + }, + { + "epoch": 3.6184210526315788, + "grad_norm": 0.039631813764572144, + "learning_rate": 2.5661775074662276e-06, + "loss": 0.0003, + "step": 275 + }, + { + "epoch": 3.6315789473684212, + "grad_norm": 0.03075975738465786, + "learning_rate": 2.544121201607822e-06, + "loss": 0.0001, + "step": 276 + }, + { + "epoch": 3.6447368421052633, + "grad_norm": 0.04068103805184364, + "learning_rate": 2.5220614598197708e-06, + "loss": 0.0001, + "step": 277 + }, + { + "epoch": 3.6578947368421053, + "grad_norm": 0.08374299108982086, + "learning_rate": 2.5e-06, + "loss": 0.0006, + "step": 278 + }, + { + "epoch": 3.6710526315789473, + "grad_norm": 0.014881132170557976, + "learning_rate": 2.477938540180231e-06, + "loss": 0.0001, + "step": 279 + }, + { + "epoch": 3.6842105263157894, + "grad_norm": 0.08170844614505768, + "learning_rate": 2.455878798392179e-06, + "loss": 0.0004, + "step": 280 + }, + { + "epoch": 3.6973684210526314, + "grad_norm": 0.02605108916759491, + "learning_rate": 2.433822492533774e-06, + "loss": 0.0001, + "step": 281 + }, + { + "epoch": 3.7105263157894735, + "grad_norm": 0.056749701499938965, + "learning_rate": 2.411771340235369e-06, + "loss": 0.0008, + "step": 282 + }, + { + "epoch": 3.723684210526316, + "grad_norm": 0.043280281126499176, + "learning_rate": 2.389727058725989e-06, + "loss": 0.0002, + "step": 283 + }, + { + "epoch": 3.736842105263158, + "grad_norm": 0.01662975363433361, + "learning_rate": 2.3676913646995923e-06, + "loss": 0.0001, + "step": 284 + }, + { + "epoch": 3.75, + "grad_norm": 0.045024238526821136, + "learning_rate": 2.3456659741813945e-06, + "loss": 0.0001, + "step": 285 + }, + { + "epoch": 3.763157894736842, + "grad_norm": 0.02116972580552101, + "learning_rate": 2.3236526023942224e-06, + "loss": 0.0001, + "step": 286 + }, + { + "epoch": 3.776315789473684, + "grad_norm": 0.028999928385019302, + "learning_rate": 2.301652963624951e-06, + "loss": 0.0001, + "step": 287 + }, + { + "epoch": 3.7894736842105265, + "grad_norm": 0.05580444633960724, + "learning_rate": 2.2796687710909966e-06, + "loss": 0.0006, + "step": 288 + }, + { + "epoch": 3.8026315789473686, + "grad_norm": 0.03946217522025108, + "learning_rate": 2.2577017368069017e-06, + "loss": 0.0002, + "step": 289 + }, + { + "epoch": 3.8157894736842106, + "grad_norm": 0.01824789121747017, + "learning_rate": 2.235753571451018e-06, + "loss": 0.0001, + "step": 290 + }, + { + "epoch": 3.8289473684210527, + "grad_norm": 0.09996017813682556, + "learning_rate": 2.2138259842322794e-06, + "loss": 0.0002, + "step": 291 + }, + { + "epoch": 3.8421052631578947, + "grad_norm": 0.04970015585422516, + "learning_rate": 2.191920682757104e-06, + "loss": 0.0002, + "step": 292 + }, + { + "epoch": 3.8552631578947367, + "grad_norm": 0.05343327671289444, + "learning_rate": 2.170039372896409e-06, + "loss": 0.0003, + "step": 293 + }, + { + "epoch": 3.8684210526315788, + "grad_norm": 0.007754841353744268, + "learning_rate": 2.148183758652774e-06, + "loss": 0.0, + "step": 294 + }, + { + "epoch": 3.8815789473684212, + "grad_norm": 0.06841913610696793, + "learning_rate": 2.126355542027734e-06, + "loss": 0.0002, + "step": 295 + }, + { + "epoch": 3.8947368421052633, + "grad_norm": 0.0050100889056921005, + "learning_rate": 2.1045564228892404e-06, + "loss": 0.0, + "step": 296 + }, + { + "epoch": 3.9078947368421053, + "grad_norm": 0.2890152931213379, + "learning_rate": 2.0827880988392856e-06, + "loss": 0.0001, + "step": 297 + }, + { + "epoch": 3.9210526315789473, + "grad_norm": 0.010258257389068604, + "learning_rate": 2.0610522650816985e-06, + "loss": 0.0, + "step": 298 + }, + { + "epoch": 3.9342105263157894, + "grad_norm": 0.031965699046850204, + "learning_rate": 2.0393506142901347e-06, + "loss": 0.0001, + "step": 299 + }, + { + "epoch": 3.9473684210526314, + "grad_norm": 0.00890852976590395, + "learning_rate": 2.017684836476258e-06, + "loss": 0.0, + "step": 300 + }, + { + "epoch": 3.9605263157894735, + "grad_norm": 0.030713632702827454, + "learning_rate": 1.9960566188581306e-06, + "loss": 0.0001, + "step": 301 + }, + { + "epoch": 3.973684210526316, + "grad_norm": 0.010088774375617504, + "learning_rate": 1.9744676457288225e-06, + "loss": 0.0, + "step": 302 + }, + { + "epoch": 3.986842105263158, + "grad_norm": 0.006950493901968002, + "learning_rate": 1.952919598325247e-06, + "loss": 0.0, + "step": 303 + }, + { + "epoch": 4.0, + "grad_norm": 0.03781810402870178, + "learning_rate": 1.9314141546972345e-06, + "loss": 0.0003, + "step": 304 + } + ], + "logging_steps": 1, + "max_steps": 456, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 76, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.955687712124882e+19, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-304/training_args.bin b/checkpoint-304/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..bd15c797edde13f0f3e0490d0aec249c013df912 --- /dev/null +++ b/checkpoint-304/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ebcaf257fe89a74904f6ea50b526a559eb74a53ebc4dfb373932a4d0fa515f5 +size 7928 diff --git a/checkpoint-304/zero_to_fp32.py b/checkpoint-304/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/checkpoint-304/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/checkpoint-380/README.md b/checkpoint-380/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1b184114a0c28ed3e4c082c18486736dc818166d --- /dev/null +++ b/checkpoint-380/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Llama-3.3-70B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.0 \ No newline at end of file diff --git a/checkpoint-380/adapter_config.json b/checkpoint-380/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..763d98264e556bcbc60c63c5b9f70b53c7bbe722 --- /dev/null +++ b/checkpoint-380/adapter_config.json @@ -0,0 +1,42 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.3-70B-Instruct", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 512, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [ + "embed_tokens", + "lm_head" + ], + "peft_type": "LORA", + "r": 256, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "gate_proj", + "up_proj", + "k_proj", + "q_proj", + "down_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-380/adapter_model.safetensors b/checkpoint-380/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..94395f80bca67a993544557687bbcd0edff5799f --- /dev/null +++ b/checkpoint-380/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3141131baac97c57ea89187b190eae4ccf3988702aabad982d04b4ab83ff82ac +size 10829849744 diff --git a/checkpoint-380/global_step380/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-380/global_step380/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..03d32413666115d6048ff15cae9fcfed2ed6ce3e --- /dev/null +++ b/checkpoint-380/global_step380/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db72035810cec0d8924e0038c0c8ecaec5f05d8b42a75090257158d04e38ef6b +size 21659418140 diff --git a/checkpoint-380/global_step380/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-380/global_step380/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dac570ca7cdd6987a12027d0617fd33865563d60 --- /dev/null +++ b/checkpoint-380/global_step380/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1920638c8d06725c94a7f04be2e6d33b7070c3b03abfe25250d686b123ed46be +size 21659457372 diff --git a/checkpoint-380/global_step380/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-380/global_step380/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0881dd3f23d22f30fcf3089a12bfe5c7ad0cd38a --- /dev/null +++ b/checkpoint-380/global_step380/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c887c8d1aff85bc382a35114f73ad79fe03817244e33ef121cc1a6e0f47fbdf9 +size 21659417820 diff --git a/checkpoint-380/global_step380/mp_rank_00_model_states.pt b/checkpoint-380/global_step380/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..39c9ffdd3cca3daf35fd0fbd5e3e2cec2676ac8a --- /dev/null +++ b/checkpoint-380/global_step380/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:017462fd854827452c31f671810c8242a5e44134c42ddd44ce9451df06ad0909 +size 11918643933 diff --git a/checkpoint-380/latest b/checkpoint-380/latest new file mode 100644 index 0000000000000000000000000000000000000000..97a28d1c33298568d84d9916417869e8f7800fb7 --- /dev/null +++ b/checkpoint-380/latest @@ -0,0 +1 @@ +global_step380 \ No newline at end of file diff --git a/checkpoint-380/rng_state_0.pth b/checkpoint-380/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..360b0f6a03a87f6aed26f672cfc6136b7bbf1611 --- /dev/null +++ b/checkpoint-380/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8fe26d5c64cbc7d621141b185caf2009e21d51970e79374540d8781688adeaf8 +size 14768 diff --git a/checkpoint-380/rng_state_1.pth b/checkpoint-380/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..acb1b5e3308c9e88d4c63b5928d441d53b890547 --- /dev/null +++ b/checkpoint-380/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78c608d9a2f68dfe0957985d05001d8334947b7cfbf16e6d2348f077e306d8cc +size 14768 diff --git a/checkpoint-380/rng_state_2.pth b/checkpoint-380/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..0f9ceea00dd006196381c120c6c3d96bf762fe79 --- /dev/null +++ b/checkpoint-380/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88d769eb76f77879c1fcd6116be4251a854ae051175d403fb920f7282b89fff9 +size 14768 diff --git a/checkpoint-380/scheduler.pt b/checkpoint-380/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..d79447f5e29bdc91f65c41e7c702584c9ad3f146 --- /dev/null +++ b/checkpoint-380/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a93097201a634931518316956d488c8df82e81f7fc29c1bde6ce7bd6033e7827 +size 1064 diff --git a/checkpoint-380/special_tokens_map.json b/checkpoint-380/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..278b7f0f84be865c4687700ee7b3c63d89a51e18 --- /dev/null +++ b/checkpoint-380/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-380/tokenizer.json b/checkpoint-380/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-380/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-380/tokenizer_config.json b/checkpoint-380/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ca91a2ef55f4239a7af81d7c9abb05f53621a07b --- /dev/null +++ b/checkpoint-380/tokenizer_config.json @@ -0,0 +1,2064 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|end_of_text|>", + "tokenizer_class": "PreTrainedTokenizer" +} diff --git a/checkpoint-380/trainer_state.json b/checkpoint-380/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..eedeb27303058c876dd93da086967789a5d8ae97 --- /dev/null +++ b/checkpoint-380/trainer_state.json @@ -0,0 +1,2693 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 5.0, + "eval_steps": 500, + "global_step": 380, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.013157894736842105, + "grad_norm": 37.79440689086914, + "learning_rate": 5.0000000000000004e-08, + "loss": 3.1402, + "step": 1 + }, + { + "epoch": 0.02631578947368421, + "grad_norm": 38.45823287963867, + "learning_rate": 1.0000000000000001e-07, + "loss": 3.1787, + "step": 2 + }, + { + "epoch": 0.039473684210526314, + "grad_norm": 38.25625228881836, + "learning_rate": 1.5000000000000002e-07, + "loss": 3.1316, + "step": 3 + }, + { + "epoch": 0.05263157894736842, + "grad_norm": 37.2024040222168, + "learning_rate": 2.0000000000000002e-07, + "loss": 3.1011, + "step": 4 + }, + { + "epoch": 0.06578947368421052, + "grad_norm": 38.17294692993164, + "learning_rate": 2.5000000000000004e-07, + "loss": 3.133, + "step": 5 + }, + { + "epoch": 0.07894736842105263, + "grad_norm": 37.374794006347656, + "learning_rate": 3.0000000000000004e-07, + "loss": 3.0731, + "step": 6 + }, + { + "epoch": 0.09210526315789473, + "grad_norm": 37.226966857910156, + "learning_rate": 3.5000000000000004e-07, + "loss": 3.069, + "step": 7 + }, + { + "epoch": 0.10526315789473684, + "grad_norm": 38.40094757080078, + "learning_rate": 4.0000000000000003e-07, + "loss": 3.1223, + "step": 8 + }, + { + "epoch": 0.11842105263157894, + "grad_norm": 37.86320877075195, + "learning_rate": 4.5000000000000003e-07, + "loss": 3.062, + "step": 9 + }, + { + "epoch": 0.13157894736842105, + "grad_norm": 38.02171325683594, + "learning_rate": 5.000000000000001e-07, + "loss": 3.0008, + "step": 10 + }, + { + "epoch": 0.14473684210526316, + "grad_norm": 38.5522346496582, + "learning_rate": 5.5e-07, + "loss": 3.0047, + "step": 11 + }, + { + "epoch": 0.15789473684210525, + "grad_norm": 37.72829818725586, + "learning_rate": 6.000000000000001e-07, + "loss": 2.9274, + "step": 12 + }, + { + "epoch": 0.17105263157894737, + "grad_norm": 38.488494873046875, + "learning_rate": 6.5e-07, + "loss": 2.8727, + "step": 13 + }, + { + "epoch": 0.18421052631578946, + "grad_norm": 38.87471389770508, + "learning_rate": 7.000000000000001e-07, + "loss": 2.8422, + "step": 14 + }, + { + "epoch": 0.19736842105263158, + "grad_norm": 37.584896087646484, + "learning_rate": 7.5e-07, + "loss": 2.6728, + "step": 15 + }, + { + "epoch": 0.21052631578947367, + "grad_norm": 37.04607391357422, + "learning_rate": 8.000000000000001e-07, + "loss": 2.5215, + "step": 16 + }, + { + "epoch": 0.2236842105263158, + "grad_norm": 37.30121994018555, + "learning_rate": 8.500000000000001e-07, + "loss": 2.4689, + "step": 17 + }, + { + "epoch": 0.23684210526315788, + "grad_norm": 35.99961853027344, + "learning_rate": 9.000000000000001e-07, + "loss": 2.3, + "step": 18 + }, + { + "epoch": 0.25, + "grad_norm": 35.817543029785156, + "learning_rate": 9.500000000000001e-07, + "loss": 2.1423, + "step": 19 + }, + { + "epoch": 0.2631578947368421, + "grad_norm": 35.056915283203125, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.9639, + "step": 20 + }, + { + "epoch": 0.27631578947368424, + "grad_norm": 34.83850860595703, + "learning_rate": 1.0500000000000001e-06, + "loss": 1.7845, + "step": 21 + }, + { + "epoch": 0.2894736842105263, + "grad_norm": 34.32366943359375, + "learning_rate": 1.1e-06, + "loss": 1.5864, + "step": 22 + }, + { + "epoch": 0.3026315789473684, + "grad_norm": 33.79611587524414, + "learning_rate": 1.1500000000000002e-06, + "loss": 1.4011, + "step": 23 + }, + { + "epoch": 0.3157894736842105, + "grad_norm": 32.596031188964844, + "learning_rate": 1.2000000000000002e-06, + "loss": 1.195, + "step": 24 + }, + { + "epoch": 0.32894736842105265, + "grad_norm": 30.045007705688477, + "learning_rate": 1.25e-06, + "loss": 0.9883, + "step": 25 + }, + { + "epoch": 0.34210526315789475, + "grad_norm": 24.89093589782715, + "learning_rate": 1.3e-06, + "loss": 0.7669, + "step": 26 + }, + { + "epoch": 0.35526315789473684, + "grad_norm": 23.454408645629883, + "learning_rate": 1.3500000000000002e-06, + "loss": 0.6304, + "step": 27 + }, + { + "epoch": 0.3684210526315789, + "grad_norm": 19.837312698364258, + "learning_rate": 1.4000000000000001e-06, + "loss": 0.4717, + "step": 28 + }, + { + "epoch": 0.3815789473684211, + "grad_norm": 15.185093879699707, + "learning_rate": 1.45e-06, + "loss": 0.363, + "step": 29 + }, + { + "epoch": 0.39473684210526316, + "grad_norm": 9.057796478271484, + "learning_rate": 1.5e-06, + "loss": 0.2439, + "step": 30 + }, + { + "epoch": 0.40789473684210525, + "grad_norm": 5.976982593536377, + "learning_rate": 1.5500000000000002e-06, + "loss": 0.1864, + "step": 31 + }, + { + "epoch": 0.42105263157894735, + "grad_norm": 3.067375421524048, + "learning_rate": 1.6000000000000001e-06, + "loss": 0.1134, + "step": 32 + }, + { + "epoch": 0.4342105263157895, + "grad_norm": 2.3589119911193848, + "learning_rate": 1.6500000000000003e-06, + "loss": 0.0985, + "step": 33 + }, + { + "epoch": 0.4473684210526316, + "grad_norm": 2.0044353008270264, + "learning_rate": 1.7000000000000002e-06, + "loss": 0.0859, + "step": 34 + }, + { + "epoch": 0.4605263157894737, + "grad_norm": 1.4279972314834595, + "learning_rate": 1.75e-06, + "loss": 0.0728, + "step": 35 + }, + { + "epoch": 0.47368421052631576, + "grad_norm": 0.9807674288749695, + "learning_rate": 1.8000000000000001e-06, + "loss": 0.061, + "step": 36 + }, + { + "epoch": 0.4868421052631579, + "grad_norm": 0.906160295009613, + "learning_rate": 1.85e-06, + "loss": 0.0676, + "step": 37 + }, + { + "epoch": 0.5, + "grad_norm": 0.8837690353393555, + "learning_rate": 1.9000000000000002e-06, + "loss": 0.0622, + "step": 38 + }, + { + "epoch": 0.5131578947368421, + "grad_norm": 0.9579435586929321, + "learning_rate": 1.9500000000000004e-06, + "loss": 0.0557, + "step": 39 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 0.8149510622024536, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.0555, + "step": 40 + }, + { + "epoch": 0.5394736842105263, + "grad_norm": 0.8899760246276855, + "learning_rate": 2.05e-06, + "loss": 0.0517, + "step": 41 + }, + { + "epoch": 0.5526315789473685, + "grad_norm": 0.6007645130157471, + "learning_rate": 2.1000000000000002e-06, + "loss": 0.0518, + "step": 42 + }, + { + "epoch": 0.5657894736842105, + "grad_norm": 0.48819127678871155, + "learning_rate": 2.15e-06, + "loss": 0.0429, + "step": 43 + }, + { + "epoch": 0.5789473684210527, + "grad_norm": 0.42939358949661255, + "learning_rate": 2.2e-06, + "loss": 0.0459, + "step": 44 + }, + { + "epoch": 0.5921052631578947, + "grad_norm": 0.5706579685211182, + "learning_rate": 2.25e-06, + "loss": 0.0453, + "step": 45 + }, + { + "epoch": 0.6052631578947368, + "grad_norm": 0.3034597337245941, + "learning_rate": 2.3000000000000004e-06, + "loss": 0.0421, + "step": 46 + }, + { + "epoch": 0.618421052631579, + "grad_norm": 0.5601783394813538, + "learning_rate": 2.35e-06, + "loss": 0.0411, + "step": 47 + }, + { + "epoch": 0.631578947368421, + "grad_norm": 0.35388317704200745, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.04, + "step": 48 + }, + { + "epoch": 0.6447368421052632, + "grad_norm": 0.48609891533851624, + "learning_rate": 2.4500000000000003e-06, + "loss": 0.04, + "step": 49 + }, + { + "epoch": 0.6578947368421053, + "grad_norm": 0.4638507068157196, + "learning_rate": 2.5e-06, + "loss": 0.0369, + "step": 50 + }, + { + "epoch": 0.6710526315789473, + "grad_norm": 0.5685771703720093, + "learning_rate": 2.55e-06, + "loss": 0.0428, + "step": 51 + }, + { + "epoch": 0.6842105263157895, + "grad_norm": 0.46358174085617065, + "learning_rate": 2.6e-06, + "loss": 0.0483, + "step": 52 + }, + { + "epoch": 0.6973684210526315, + "grad_norm": 0.35054436326026917, + "learning_rate": 2.6500000000000005e-06, + "loss": 0.0391, + "step": 53 + }, + { + "epoch": 0.7105263157894737, + "grad_norm": 0.3350559175014496, + "learning_rate": 2.7000000000000004e-06, + "loss": 0.039, + "step": 54 + }, + { + "epoch": 0.7236842105263158, + "grad_norm": 0.2875112295150757, + "learning_rate": 2.7500000000000004e-06, + "loss": 0.0383, + "step": 55 + }, + { + "epoch": 0.7368421052631579, + "grad_norm": 0.4492928683757782, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.0358, + "step": 56 + }, + { + "epoch": 0.75, + "grad_norm": 0.29484888911247253, + "learning_rate": 2.85e-06, + "loss": 0.0355, + "step": 57 + }, + { + "epoch": 0.7631578947368421, + "grad_norm": 0.36551928520202637, + "learning_rate": 2.9e-06, + "loss": 0.0403, + "step": 58 + }, + { + "epoch": 0.7763157894736842, + "grad_norm": 0.4458053708076477, + "learning_rate": 2.95e-06, + "loss": 0.0342, + "step": 59 + }, + { + "epoch": 0.7894736842105263, + "grad_norm": 0.34047460556030273, + "learning_rate": 3e-06, + "loss": 0.0302, + "step": 60 + }, + { + "epoch": 0.8026315789473685, + "grad_norm": 0.3420606255531311, + "learning_rate": 3.05e-06, + "loss": 0.034, + "step": 61 + }, + { + "epoch": 0.8157894736842105, + "grad_norm": 0.3902851939201355, + "learning_rate": 3.1000000000000004e-06, + "loss": 0.0327, + "step": 62 + }, + { + "epoch": 0.8289473684210527, + "grad_norm": 0.29165828227996826, + "learning_rate": 3.1500000000000003e-06, + "loss": 0.0341, + "step": 63 + }, + { + "epoch": 0.8421052631578947, + "grad_norm": 0.40872958302497864, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.035, + "step": 64 + }, + { + "epoch": 0.8552631578947368, + "grad_norm": 0.36295783519744873, + "learning_rate": 3.2500000000000002e-06, + "loss": 0.0323, + "step": 65 + }, + { + "epoch": 0.868421052631579, + "grad_norm": 0.3857724368572235, + "learning_rate": 3.3000000000000006e-06, + "loss": 0.0336, + "step": 66 + }, + { + "epoch": 0.881578947368421, + "grad_norm": 0.3207017481327057, + "learning_rate": 3.3500000000000005e-06, + "loss": 0.0332, + "step": 67 + }, + { + "epoch": 0.8947368421052632, + "grad_norm": 0.2903987169265747, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.0327, + "step": 68 + }, + { + "epoch": 0.9078947368421053, + "grad_norm": 0.3386954963207245, + "learning_rate": 3.45e-06, + "loss": 0.0308, + "step": 69 + }, + { + "epoch": 0.9210526315789473, + "grad_norm": 0.4339621365070343, + "learning_rate": 3.5e-06, + "loss": 0.0361, + "step": 70 + }, + { + "epoch": 0.9342105263157895, + "grad_norm": 0.28095564246177673, + "learning_rate": 3.5500000000000003e-06, + "loss": 0.0306, + "step": 71 + }, + { + "epoch": 0.9473684210526315, + "grad_norm": 0.4141469895839691, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.028, + "step": 72 + }, + { + "epoch": 0.9605263157894737, + "grad_norm": 0.35212820768356323, + "learning_rate": 3.65e-06, + "loss": 0.032, + "step": 73 + }, + { + "epoch": 0.9736842105263158, + "grad_norm": 0.26956063508987427, + "learning_rate": 3.7e-06, + "loss": 0.0294, + "step": 74 + }, + { + "epoch": 0.9868421052631579, + "grad_norm": 0.32735681533813477, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.0272, + "step": 75 + }, + { + "epoch": 1.0, + "grad_norm": 0.4906782805919647, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.0324, + "step": 76 + }, + { + "epoch": 1.013157894736842, + "grad_norm": 0.3451901078224182, + "learning_rate": 3.85e-06, + "loss": 0.0288, + "step": 77 + }, + { + "epoch": 1.0263157894736843, + "grad_norm": 0.30598726868629456, + "learning_rate": 3.900000000000001e-06, + "loss": 0.0305, + "step": 78 + }, + { + "epoch": 1.0394736842105263, + "grad_norm": 0.31189921498298645, + "learning_rate": 3.95e-06, + "loss": 0.0274, + "step": 79 + }, + { + "epoch": 1.0526315789473684, + "grad_norm": 0.31895947456359863, + "learning_rate": 4.000000000000001e-06, + "loss": 0.0236, + "step": 80 + }, + { + "epoch": 1.0657894736842106, + "grad_norm": 0.3290308117866516, + "learning_rate": 4.05e-06, + "loss": 0.0284, + "step": 81 + }, + { + "epoch": 1.0789473684210527, + "grad_norm": 0.3651576638221741, + "learning_rate": 4.1e-06, + "loss": 0.0274, + "step": 82 + }, + { + "epoch": 1.0921052631578947, + "grad_norm": 0.2393084615468979, + "learning_rate": 4.15e-06, + "loss": 0.0301, + "step": 83 + }, + { + "epoch": 1.1052631578947367, + "grad_norm": 0.333898663520813, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.0235, + "step": 84 + }, + { + "epoch": 1.118421052631579, + "grad_norm": 0.3287582993507385, + "learning_rate": 4.25e-06, + "loss": 0.0248, + "step": 85 + }, + { + "epoch": 1.131578947368421, + "grad_norm": 0.3432455360889435, + "learning_rate": 4.3e-06, + "loss": 0.026, + "step": 86 + }, + { + "epoch": 1.1447368421052633, + "grad_norm": 0.3176783621311188, + "learning_rate": 4.350000000000001e-06, + "loss": 0.0249, + "step": 87 + }, + { + "epoch": 1.1578947368421053, + "grad_norm": 0.33373433351516724, + "learning_rate": 4.4e-06, + "loss": 0.0251, + "step": 88 + }, + { + "epoch": 1.1710526315789473, + "grad_norm": 0.36087968945503235, + "learning_rate": 4.450000000000001e-06, + "loss": 0.0251, + "step": 89 + }, + { + "epoch": 1.1842105263157894, + "grad_norm": 0.3681696057319641, + "learning_rate": 4.5e-06, + "loss": 0.0276, + "step": 90 + }, + { + "epoch": 1.1973684210526316, + "grad_norm": 0.46539774537086487, + "learning_rate": 4.5500000000000005e-06, + "loss": 0.0229, + "step": 91 + }, + { + "epoch": 1.2105263157894737, + "grad_norm": 0.23368288576602936, + "learning_rate": 4.600000000000001e-06, + "loss": 0.021, + "step": 92 + }, + { + "epoch": 1.2236842105263157, + "grad_norm": 0.26623716950416565, + "learning_rate": 4.65e-06, + "loss": 0.0265, + "step": 93 + }, + { + "epoch": 1.236842105263158, + "grad_norm": 0.28750717639923096, + "learning_rate": 4.7e-06, + "loss": 0.0221, + "step": 94 + }, + { + "epoch": 1.25, + "grad_norm": 0.46578383445739746, + "learning_rate": 4.75e-06, + "loss": 0.0236, + "step": 95 + }, + { + "epoch": 1.263157894736842, + "grad_norm": 0.33406543731689453, + "learning_rate": 4.800000000000001e-06, + "loss": 0.0239, + "step": 96 + }, + { + "epoch": 1.2763157894736843, + "grad_norm": 0.21247217059135437, + "learning_rate": 4.85e-06, + "loss": 0.0188, + "step": 97 + }, + { + "epoch": 1.2894736842105263, + "grad_norm": 0.26229164004325867, + "learning_rate": 4.9000000000000005e-06, + "loss": 0.022, + "step": 98 + }, + { + "epoch": 1.3026315789473684, + "grad_norm": 0.2967258393764496, + "learning_rate": 4.95e-06, + "loss": 0.0218, + "step": 99 + }, + { + "epoch": 1.3157894736842106, + "grad_norm": 0.419189453125, + "learning_rate": 5e-06, + "loss": 0.0247, + "step": 100 + }, + { + "epoch": 1.3289473684210527, + "grad_norm": 0.25418952107429504, + "learning_rate": 4.999902656502973e-06, + "loss": 0.0223, + "step": 101 + }, + { + "epoch": 1.3421052631578947, + "grad_norm": 0.20174147188663483, + "learning_rate": 4.9996106335924965e-06, + "loss": 0.0266, + "step": 102 + }, + { + "epoch": 1.3552631578947367, + "grad_norm": 0.21732494235038757, + "learning_rate": 4.999123954009797e-06, + "loss": 0.0188, + "step": 103 + }, + { + "epoch": 1.368421052631579, + "grad_norm": 0.2683119773864746, + "learning_rate": 4.998442655654946e-06, + "loss": 0.0203, + "step": 104 + }, + { + "epoch": 1.381578947368421, + "grad_norm": 0.18175765872001648, + "learning_rate": 4.997566791583916e-06, + "loss": 0.0185, + "step": 105 + }, + { + "epoch": 1.3947368421052633, + "grad_norm": 0.3932501971721649, + "learning_rate": 4.996496430004446e-06, + "loss": 0.0238, + "step": 106 + }, + { + "epoch": 1.4078947368421053, + "grad_norm": 0.31145599484443665, + "learning_rate": 4.995231654270726e-06, + "loss": 0.0199, + "step": 107 + }, + { + "epoch": 1.4210526315789473, + "grad_norm": 0.41356661915779114, + "learning_rate": 4.993772562876909e-06, + "loss": 0.0187, + "step": 108 + }, + { + "epoch": 1.4342105263157894, + "grad_norm": 0.22484919428825378, + "learning_rate": 4.992119269449445e-06, + "loss": 0.0182, + "step": 109 + }, + { + "epoch": 1.4473684210526316, + "grad_norm": 0.28703081607818604, + "learning_rate": 4.990271902738223e-06, + "loss": 0.0239, + "step": 110 + }, + { + "epoch": 1.4605263157894737, + "grad_norm": 0.2394670695066452, + "learning_rate": 4.988230606606552e-06, + "loss": 0.0171, + "step": 111 + }, + { + "epoch": 1.4736842105263157, + "grad_norm": 0.3552885949611664, + "learning_rate": 4.985995540019956e-06, + "loss": 0.0226, + "step": 112 + }, + { + "epoch": 1.486842105263158, + "grad_norm": 0.24968908727169037, + "learning_rate": 4.983566877033791e-06, + "loss": 0.0193, + "step": 113 + }, + { + "epoch": 1.5, + "grad_norm": 0.24420695006847382, + "learning_rate": 4.980944806779698e-06, + "loss": 0.0226, + "step": 114 + }, + { + "epoch": 1.513157894736842, + "grad_norm": 0.34696799516677856, + "learning_rate": 4.9781295334508664e-06, + "loss": 0.02, + "step": 115 + }, + { + "epoch": 1.526315789473684, + "grad_norm": 0.23682132363319397, + "learning_rate": 4.975121276286136e-06, + "loss": 0.0194, + "step": 116 + }, + { + "epoch": 1.5394736842105263, + "grad_norm": 0.2485751509666443, + "learning_rate": 4.9719202695529265e-06, + "loss": 0.0149, + "step": 117 + }, + { + "epoch": 1.5526315789473686, + "grad_norm": 0.2815033495426178, + "learning_rate": 4.968526762528988e-06, + "loss": 0.0153, + "step": 118 + }, + { + "epoch": 1.5657894736842106, + "grad_norm": 0.24127744138240814, + "learning_rate": 4.964941019482995e-06, + "loss": 0.019, + "step": 119 + }, + { + "epoch": 1.5789473684210527, + "grad_norm": 0.2987695038318634, + "learning_rate": 4.961163319653959e-06, + "loss": 0.0165, + "step": 120 + }, + { + "epoch": 1.5921052631578947, + "grad_norm": 0.33492133021354675, + "learning_rate": 4.9571939572294914e-06, + "loss": 0.0185, + "step": 121 + }, + { + "epoch": 1.6052631578947367, + "grad_norm": 0.20466521382331848, + "learning_rate": 4.953033241322887e-06, + "loss": 0.0151, + "step": 122 + }, + { + "epoch": 1.618421052631579, + "grad_norm": 0.36396247148513794, + "learning_rate": 4.948681495949055e-06, + "loss": 0.0138, + "step": 123 + }, + { + "epoch": 1.631578947368421, + "grad_norm": 0.2000381350517273, + "learning_rate": 4.944139059999286e-06, + "loss": 0.0125, + "step": 124 + }, + { + "epoch": 1.6447368421052633, + "grad_norm": 0.24977952241897583, + "learning_rate": 4.939406287214861e-06, + "loss": 0.0152, + "step": 125 + }, + { + "epoch": 1.6578947368421053, + "grad_norm": 0.26705336570739746, + "learning_rate": 4.9344835461595016e-06, + "loss": 0.0148, + "step": 126 + }, + { + "epoch": 1.6710526315789473, + "grad_norm": 0.26699599623680115, + "learning_rate": 4.929371220190671e-06, + "loss": 0.0151, + "step": 127 + }, + { + "epoch": 1.6842105263157894, + "grad_norm": 0.20149633288383484, + "learning_rate": 4.9240697074297205e-06, + "loss": 0.0151, + "step": 128 + }, + { + "epoch": 1.6973684210526314, + "grad_norm": 0.1961003988981247, + "learning_rate": 4.918579420730884e-06, + "loss": 0.0163, + "step": 129 + }, + { + "epoch": 1.7105263157894737, + "grad_norm": 0.2148503214120865, + "learning_rate": 4.912900787649124e-06, + "loss": 0.0137, + "step": 130 + }, + { + "epoch": 1.723684210526316, + "grad_norm": 0.20505128800868988, + "learning_rate": 4.907034250406846e-06, + "loss": 0.0136, + "step": 131 + }, + { + "epoch": 1.736842105263158, + "grad_norm": 0.19462467730045319, + "learning_rate": 4.900980265859449e-06, + "loss": 0.0139, + "step": 132 + }, + { + "epoch": 1.75, + "grad_norm": 0.21602794528007507, + "learning_rate": 4.894739305459754e-06, + "loss": 0.015, + "step": 133 + }, + { + "epoch": 1.763157894736842, + "grad_norm": 0.22933153808116913, + "learning_rate": 4.88831185522129e-06, + "loss": 0.0142, + "step": 134 + }, + { + "epoch": 1.776315789473684, + "grad_norm": 0.1785646229982376, + "learning_rate": 4.881698415680442e-06, + "loss": 0.0097, + "step": 135 + }, + { + "epoch": 1.7894736842105263, + "grad_norm": 0.21535581350326538, + "learning_rate": 4.874899501857477e-06, + "loss": 0.0106, + "step": 136 + }, + { + "epoch": 1.8026315789473686, + "grad_norm": 0.2360723614692688, + "learning_rate": 4.867915643216434e-06, + "loss": 0.0123, + "step": 137 + }, + { + "epoch": 1.8157894736842106, + "grad_norm": 0.18098825216293335, + "learning_rate": 4.860747383623889e-06, + "loss": 0.0126, + "step": 138 + }, + { + "epoch": 1.8289473684210527, + "grad_norm": 0.1836131066083908, + "learning_rate": 4.85339528130661e-06, + "loss": 0.0125, + "step": 139 + }, + { + "epoch": 1.8421052631578947, + "grad_norm": 0.34765973687171936, + "learning_rate": 4.845859908808074e-06, + "loss": 0.0158, + "step": 140 + }, + { + "epoch": 1.8552631578947367, + "grad_norm": 0.22595159709453583, + "learning_rate": 4.838141852943891e-06, + "loss": 0.0101, + "step": 141 + }, + { + "epoch": 1.868421052631579, + "grad_norm": 0.2811257243156433, + "learning_rate": 4.830241714756099e-06, + "loss": 0.0111, + "step": 142 + }, + { + "epoch": 1.881578947368421, + "grad_norm": 0.1875840127468109, + "learning_rate": 4.822160109466361e-06, + "loss": 0.0086, + "step": 143 + }, + { + "epoch": 1.8947368421052633, + "grad_norm": 0.19390800595283508, + "learning_rate": 4.813897666428054e-06, + "loss": 0.0106, + "step": 144 + }, + { + "epoch": 1.9078947368421053, + "grad_norm": 0.3725268244743347, + "learning_rate": 4.805455029077255e-06, + "loss": 0.0095, + "step": 145 + }, + { + "epoch": 1.9210526315789473, + "grad_norm": 0.2201736867427826, + "learning_rate": 4.79683285488264e-06, + "loss": 0.0074, + "step": 146 + }, + { + "epoch": 1.9342105263157894, + "grad_norm": 0.17423805594444275, + "learning_rate": 4.788031815294282e-06, + "loss": 0.0072, + "step": 147 + }, + { + "epoch": 1.9473684210526314, + "grad_norm": 0.22169643640518188, + "learning_rate": 4.779052595691355e-06, + "loss": 0.0121, + "step": 148 + }, + { + "epoch": 1.9605263157894737, + "grad_norm": 0.3247295618057251, + "learning_rate": 4.76989589532877e-06, + "loss": 0.0121, + "step": 149 + }, + { + "epoch": 1.973684210526316, + "grad_norm": 0.1830369532108307, + "learning_rate": 4.7605624272827125e-06, + "loss": 0.0077, + "step": 150 + }, + { + "epoch": 1.986842105263158, + "grad_norm": 0.2967239022254944, + "learning_rate": 4.75105291839512e-06, + "loss": 0.0104, + "step": 151 + }, + { + "epoch": 2.0, + "grad_norm": 0.17589347064495087, + "learning_rate": 4.741368109217072e-06, + "loss": 0.0075, + "step": 152 + }, + { + "epoch": 2.013157894736842, + "grad_norm": 0.15554101765155792, + "learning_rate": 4.7315087539511225e-06, + "loss": 0.0063, + "step": 153 + }, + { + "epoch": 2.026315789473684, + "grad_norm": 0.13191422820091248, + "learning_rate": 4.721475620392567e-06, + "loss": 0.0039, + "step": 154 + }, + { + "epoch": 2.039473684210526, + "grad_norm": 0.1909502148628235, + "learning_rate": 4.711269489869654e-06, + "loss": 0.0055, + "step": 155 + }, + { + "epoch": 2.0526315789473686, + "grad_norm": 0.16942323744297028, + "learning_rate": 4.700891157182729e-06, + "loss": 0.0055, + "step": 156 + }, + { + "epoch": 2.0657894736842106, + "grad_norm": 0.1740521341562271, + "learning_rate": 4.690341430542351e-06, + "loss": 0.006, + "step": 157 + }, + { + "epoch": 2.0789473684210527, + "grad_norm": 0.19565710425376892, + "learning_rate": 4.679621131506347e-06, + "loss": 0.0057, + "step": 158 + }, + { + "epoch": 2.0921052631578947, + "grad_norm": 0.13488221168518066, + "learning_rate": 4.668731094915835e-06, + "loss": 0.0033, + "step": 159 + }, + { + "epoch": 2.1052631578947367, + "grad_norm": 0.16322746872901917, + "learning_rate": 4.657672168830211e-06, + "loss": 0.0042, + "step": 160 + }, + { + "epoch": 2.1184210526315788, + "grad_norm": 0.2087877243757248, + "learning_rate": 4.646445214461105e-06, + "loss": 0.0052, + "step": 161 + }, + { + "epoch": 2.1315789473684212, + "grad_norm": 0.12736408412456512, + "learning_rate": 4.635051106105316e-06, + "loss": 0.0051, + "step": 162 + }, + { + "epoch": 2.1447368421052633, + "grad_norm": 0.13264045119285583, + "learning_rate": 4.623490731076728e-06, + "loss": 0.0035, + "step": 163 + }, + { + "epoch": 2.1578947368421053, + "grad_norm": 0.2015363723039627, + "learning_rate": 4.6117649896372055e-06, + "loss": 0.0055, + "step": 164 + }, + { + "epoch": 2.1710526315789473, + "grad_norm": 0.21640510857105255, + "learning_rate": 4.59987479492649e-06, + "loss": 0.008, + "step": 165 + }, + { + "epoch": 2.1842105263157894, + "grad_norm": 0.17276327311992645, + "learning_rate": 4.587821072891089e-06, + "loss": 0.0058, + "step": 166 + }, + { + "epoch": 2.1973684210526314, + "grad_norm": 0.15923018753528595, + "learning_rate": 4.5756047622121665e-06, + "loss": 0.0039, + "step": 167 + }, + { + "epoch": 2.2105263157894735, + "grad_norm": 0.14791631698608398, + "learning_rate": 4.563226814232444e-06, + "loss": 0.0032, + "step": 168 + }, + { + "epoch": 2.223684210526316, + "grad_norm": 0.16776816546916962, + "learning_rate": 4.550688192882115e-06, + "loss": 0.0043, + "step": 169 + }, + { + "epoch": 2.236842105263158, + "grad_norm": 0.12374848127365112, + "learning_rate": 4.53798987460378e-06, + "loss": 0.0035, + "step": 170 + }, + { + "epoch": 2.25, + "grad_norm": 0.13051433861255646, + "learning_rate": 4.525132848276405e-06, + "loss": 0.0036, + "step": 171 + }, + { + "epoch": 2.263157894736842, + "grad_norm": 0.12607790529727936, + "learning_rate": 4.512118115138315e-06, + "loss": 0.0052, + "step": 172 + }, + { + "epoch": 2.276315789473684, + "grad_norm": 0.09630817174911499, + "learning_rate": 4.498946688709216e-06, + "loss": 0.0031, + "step": 173 + }, + { + "epoch": 2.2894736842105265, + "grad_norm": 0.11332327872514725, + "learning_rate": 4.485619594711278e-06, + "loss": 0.0043, + "step": 174 + }, + { + "epoch": 2.3026315789473686, + "grad_norm": 0.16632875800132751, + "learning_rate": 4.4721378709892475e-06, + "loss": 0.005, + "step": 175 + }, + { + "epoch": 2.3157894736842106, + "grad_norm": 0.12856662273406982, + "learning_rate": 4.4585025674296315e-06, + "loss": 0.0031, + "step": 176 + }, + { + "epoch": 2.3289473684210527, + "grad_norm": 0.197174072265625, + "learning_rate": 4.444714745878936e-06, + "loss": 0.0045, + "step": 177 + }, + { + "epoch": 2.3421052631578947, + "grad_norm": 0.17151176929473877, + "learning_rate": 4.430775480060973e-06, + "loss": 0.0044, + "step": 178 + }, + { + "epoch": 2.3552631578947367, + "grad_norm": 0.14734052121639252, + "learning_rate": 4.416685855493246e-06, + "loss": 0.0053, + "step": 179 + }, + { + "epoch": 2.3684210526315788, + "grad_norm": 0.13286560773849487, + "learning_rate": 4.4024469694024194e-06, + "loss": 0.0039, + "step": 180 + }, + { + "epoch": 2.3815789473684212, + "grad_norm": 0.1636727899312973, + "learning_rate": 4.388059930638865e-06, + "loss": 0.0039, + "step": 181 + }, + { + "epoch": 2.3947368421052633, + "grad_norm": 0.1082785576581955, + "learning_rate": 4.373525859590313e-06, + "loss": 0.0025, + "step": 182 + }, + { + "epoch": 2.4078947368421053, + "grad_norm": 0.1716354638338089, + "learning_rate": 4.358845888094607e-06, + "loss": 0.004, + "step": 183 + }, + { + "epoch": 2.4210526315789473, + "grad_norm": 0.14045757055282593, + "learning_rate": 4.3440211593515556e-06, + "loss": 0.0026, + "step": 184 + }, + { + "epoch": 2.4342105263157894, + "grad_norm": 0.1682705134153366, + "learning_rate": 4.32905282783391e-06, + "loss": 0.0042, + "step": 185 + }, + { + "epoch": 2.4473684210526314, + "grad_norm": 0.11872018873691559, + "learning_rate": 4.313942059197457e-06, + "loss": 0.0028, + "step": 186 + }, + { + "epoch": 2.4605263157894735, + "grad_norm": 0.12182936072349548, + "learning_rate": 4.298690030190247e-06, + "loss": 0.0018, + "step": 187 + }, + { + "epoch": 2.473684210526316, + "grad_norm": 0.2031281590461731, + "learning_rate": 4.283297928560951e-06, + "loss": 0.0032, + "step": 188 + }, + { + "epoch": 2.486842105263158, + "grad_norm": 0.0959291160106659, + "learning_rate": 4.267766952966369e-06, + "loss": 0.0015, + "step": 189 + }, + { + "epoch": 2.5, + "grad_norm": 0.15291978418827057, + "learning_rate": 4.252098312878083e-06, + "loss": 0.0036, + "step": 190 + }, + { + "epoch": 2.513157894736842, + "grad_norm": 0.15930163860321045, + "learning_rate": 4.236293228488267e-06, + "loss": 0.0047, + "step": 191 + }, + { + "epoch": 2.526315789473684, + "grad_norm": 0.2150997817516327, + "learning_rate": 4.220352930614672e-06, + "loss": 0.0038, + "step": 192 + }, + { + "epoch": 2.5394736842105265, + "grad_norm": 0.1317511945962906, + "learning_rate": 4.204278660604767e-06, + "loss": 0.0032, + "step": 193 + }, + { + "epoch": 2.5526315789473686, + "grad_norm": 0.07808093726634979, + "learning_rate": 4.1880716702390764e-06, + "loss": 0.0011, + "step": 194 + }, + { + "epoch": 2.5657894736842106, + "grad_norm": 0.13284094631671906, + "learning_rate": 4.171733221633695e-06, + "loss": 0.0037, + "step": 195 + }, + { + "epoch": 2.5789473684210527, + "grad_norm": 0.16264718770980835, + "learning_rate": 4.155264587142002e-06, + "loss": 0.0039, + "step": 196 + }, + { + "epoch": 2.5921052631578947, + "grad_norm": 0.10431212931871414, + "learning_rate": 4.138667049255574e-06, + "loss": 0.0023, + "step": 197 + }, + { + "epoch": 2.6052631578947367, + "grad_norm": 0.08813079446554184, + "learning_rate": 4.121941900504316e-06, + "loss": 0.0018, + "step": 198 + }, + { + "epoch": 2.6184210526315788, + "grad_norm": 0.22164294123649597, + "learning_rate": 4.105090443355801e-06, + "loss": 0.0037, + "step": 199 + }, + { + "epoch": 2.6315789473684212, + "grad_norm": 0.09111231565475464, + "learning_rate": 4.088113990113846e-06, + "loss": 0.0019, + "step": 200 + }, + { + "epoch": 2.6447368421052633, + "grad_norm": 0.0871724933385849, + "learning_rate": 4.071013862816311e-06, + "loss": 0.0014, + "step": 201 + }, + { + "epoch": 2.6578947368421053, + "grad_norm": 0.2138734757900238, + "learning_rate": 4.0537913931321495e-06, + "loss": 0.0022, + "step": 202 + }, + { + "epoch": 2.6710526315789473, + "grad_norm": 0.11238733679056168, + "learning_rate": 4.036447922257699e-06, + "loss": 0.0023, + "step": 203 + }, + { + "epoch": 2.6842105263157894, + "grad_norm": 0.0815015360713005, + "learning_rate": 4.018984800812248e-06, + "loss": 0.0011, + "step": 204 + }, + { + "epoch": 2.6973684210526314, + "grad_norm": 0.304352343082428, + "learning_rate": 4.001403388732842e-06, + "loss": 0.003, + "step": 205 + }, + { + "epoch": 2.7105263157894735, + "grad_norm": 0.10469458252191544, + "learning_rate": 3.983705055168391e-06, + "loss": 0.0009, + "step": 206 + }, + { + "epoch": 2.723684210526316, + "grad_norm": 0.1440751701593399, + "learning_rate": 3.965891178373038e-06, + "loss": 0.0025, + "step": 207 + }, + { + "epoch": 2.736842105263158, + "grad_norm": 0.2173687070608139, + "learning_rate": 3.947963145598833e-06, + "loss": 0.0028, + "step": 208 + }, + { + "epoch": 2.75, + "grad_norm": 0.2922506332397461, + "learning_rate": 3.929922352987702e-06, + "loss": 0.003, + "step": 209 + }, + { + "epoch": 2.763157894736842, + "grad_norm": 0.18853916227817535, + "learning_rate": 3.911770205462717e-06, + "loss": 0.0014, + "step": 210 + }, + { + "epoch": 2.776315789473684, + "grad_norm": 0.12060266733169556, + "learning_rate": 3.8935081166186935e-06, + "loss": 0.0015, + "step": 211 + }, + { + "epoch": 2.7894736842105265, + "grad_norm": 0.14512351155281067, + "learning_rate": 3.875137508612104e-06, + "loss": 0.002, + "step": 212 + }, + { + "epoch": 2.8026315789473686, + "grad_norm": 0.15343990921974182, + "learning_rate": 3.856659812050328e-06, + "loss": 0.0012, + "step": 213 + }, + { + "epoch": 2.8157894736842106, + "grad_norm": 0.09639029949903488, + "learning_rate": 3.838076465880248e-06, + "loss": 0.0017, + "step": 214 + }, + { + "epoch": 2.8289473684210527, + "grad_norm": 0.09907295554876328, + "learning_rate": 3.819388917276186e-06, + "loss": 0.0012, + "step": 215 + }, + { + "epoch": 2.8421052631578947, + "grad_norm": 0.05898207053542137, + "learning_rate": 3.8005986215272056e-06, + "loss": 0.0006, + "step": 216 + }, + { + "epoch": 2.8552631578947367, + "grad_norm": 0.10509718954563141, + "learning_rate": 3.7817070419237866e-06, + "loss": 0.0013, + "step": 217 + }, + { + "epoch": 2.8684210526315788, + "grad_norm": 0.17495931684970856, + "learning_rate": 3.7627156496438686e-06, + "loss": 0.001, + "step": 218 + }, + { + "epoch": 2.8815789473684212, + "grad_norm": 0.12321923673152924, + "learning_rate": 3.7436259236382797e-06, + "loss": 0.001, + "step": 219 + }, + { + "epoch": 2.8947368421052633, + "grad_norm": 0.11147578060626984, + "learning_rate": 3.7244393505155713e-06, + "loss": 0.0015, + "step": 220 + }, + { + "epoch": 2.9078947368421053, + "grad_norm": 0.06215621903538704, + "learning_rate": 3.7051574244262412e-06, + "loss": 0.0006, + "step": 221 + }, + { + "epoch": 2.9210526315789473, + "grad_norm": 0.03004705347120762, + "learning_rate": 3.6857816469463806e-06, + "loss": 0.0002, + "step": 222 + }, + { + "epoch": 2.9342105263157894, + "grad_norm": 0.09312062710523605, + "learning_rate": 3.6663135269607413e-06, + "loss": 0.0014, + "step": 223 + }, + { + "epoch": 2.9473684210526314, + "grad_norm": 0.05324690416455269, + "learning_rate": 3.6467545805452266e-06, + "loss": 0.0005, + "step": 224 + }, + { + "epoch": 2.9605263157894735, + "grad_norm": 0.06438126415014267, + "learning_rate": 3.6271063308488298e-06, + "loss": 0.0005, + "step": 225 + }, + { + "epoch": 2.973684210526316, + "grad_norm": 0.08646634221076965, + "learning_rate": 3.6073703079750204e-06, + "loss": 0.0006, + "step": 226 + }, + { + "epoch": 2.986842105263158, + "grad_norm": 0.05682829022407532, + "learning_rate": 3.5875480488625847e-06, + "loss": 0.0006, + "step": 227 + }, + { + "epoch": 3.0, + "grad_norm": 0.11831886321306229, + "learning_rate": 3.5676410971659404e-06, + "loss": 0.0006, + "step": 228 + }, + { + "epoch": 3.013157894736842, + "grad_norm": 0.0533737950026989, + "learning_rate": 3.547651003134921e-06, + "loss": 0.0003, + "step": 229 + }, + { + "epoch": 3.026315789473684, + "grad_norm": 0.06704334169626236, + "learning_rate": 3.527579323494055e-06, + "loss": 0.0005, + "step": 230 + }, + { + "epoch": 3.039473684210526, + "grad_norm": 0.024390004575252533, + "learning_rate": 3.507427621321331e-06, + "loss": 0.0002, + "step": 231 + }, + { + "epoch": 3.0526315789473686, + "grad_norm": 0.10754281282424927, + "learning_rate": 3.4871974659264786e-06, + "loss": 0.0009, + "step": 232 + }, + { + "epoch": 3.0657894736842106, + "grad_norm": 0.032474737614393234, + "learning_rate": 3.466890432728754e-06, + "loss": 0.0002, + "step": 233 + }, + { + "epoch": 3.0789473684210527, + "grad_norm": 0.11489477753639221, + "learning_rate": 3.446508103134259e-06, + "loss": 0.0008, + "step": 234 + }, + { + "epoch": 3.0921052631578947, + "grad_norm": 0.11805123090744019, + "learning_rate": 3.426052064412785e-06, + "loss": 0.0013, + "step": 235 + }, + { + "epoch": 3.1052631578947367, + "grad_norm": 0.04284543916583061, + "learning_rate": 3.4055239095742067e-06, + "loss": 0.0004, + "step": 236 + }, + { + "epoch": 3.1184210526315788, + "grad_norm": 0.0592227578163147, + "learning_rate": 3.3849252372444295e-06, + "loss": 0.0005, + "step": 237 + }, + { + "epoch": 3.1315789473684212, + "grad_norm": 0.08888686448335648, + "learning_rate": 3.364257651540891e-06, + "loss": 0.0007, + "step": 238 + }, + { + "epoch": 3.1447368421052633, + "grad_norm": 0.04613477736711502, + "learning_rate": 3.343522761947646e-06, + "loss": 0.0004, + "step": 239 + }, + { + "epoch": 3.1578947368421053, + "grad_norm": 0.024109596386551857, + "learning_rate": 3.322722183190025e-06, + "loss": 0.0002, + "step": 240 + }, + { + "epoch": 3.1710526315789473, + "grad_norm": 0.04759601503610611, + "learning_rate": 3.3018575351088894e-06, + "loss": 0.0002, + "step": 241 + }, + { + "epoch": 3.1842105263157894, + "grad_norm": 0.06583128869533539, + "learning_rate": 3.280930442534486e-06, + "loss": 0.0003, + "step": 242 + }, + { + "epoch": 3.1973684210526314, + "grad_norm": 0.03406457230448723, + "learning_rate": 3.2599425351599136e-06, + "loss": 0.0001, + "step": 243 + }, + { + "epoch": 3.2105263157894735, + "grad_norm": 0.020209595561027527, + "learning_rate": 3.238895447414211e-06, + "loss": 0.0002, + "step": 244 + }, + { + "epoch": 3.223684210526316, + "grad_norm": 0.04148108884692192, + "learning_rate": 3.217790818335077e-06, + "loss": 0.0002, + "step": 245 + }, + { + "epoch": 3.236842105263158, + "grad_norm": 0.05275535210967064, + "learning_rate": 3.196630291441231e-06, + "loss": 0.0003, + "step": 246 + }, + { + "epoch": 3.25, + "grad_norm": 0.03692334517836571, + "learning_rate": 3.175415514604422e-06, + "loss": 0.0004, + "step": 247 + }, + { + "epoch": 3.263157894736842, + "grad_norm": 0.06586624681949615, + "learning_rate": 3.154148139921102e-06, + "loss": 0.0004, + "step": 248 + }, + { + "epoch": 3.276315789473684, + "grad_norm": 0.06966730207204819, + "learning_rate": 3.132829823583771e-06, + "loss": 0.0003, + "step": 249 + }, + { + "epoch": 3.2894736842105265, + "grad_norm": 0.10422863811254501, + "learning_rate": 3.1114622257520004e-06, + "loss": 0.0004, + "step": 250 + }, + { + "epoch": 3.3026315789473686, + "grad_norm": 0.10132399946451187, + "learning_rate": 3.0900470104231456e-06, + "loss": 0.0008, + "step": 251 + }, + { + "epoch": 3.3157894736842106, + "grad_norm": 0.06418923288583755, + "learning_rate": 3.0685858453027668e-06, + "loss": 0.0004, + "step": 252 + }, + { + "epoch": 3.3289473684210527, + "grad_norm": 0.1488313227891922, + "learning_rate": 3.047080401674754e-06, + "loss": 0.0021, + "step": 253 + }, + { + "epoch": 3.3421052631578947, + "grad_norm": 0.05905044823884964, + "learning_rate": 3.0255323542711784e-06, + "loss": 0.0006, + "step": 254 + }, + { + "epoch": 3.3552631578947367, + "grad_norm": 0.05011991411447525, + "learning_rate": 3.00394338114187e-06, + "loss": 0.0004, + "step": 255 + }, + { + "epoch": 3.3684210526315788, + "grad_norm": 0.026315532624721527, + "learning_rate": 2.9823151635237424e-06, + "loss": 0.0002, + "step": 256 + }, + { + "epoch": 3.3815789473684212, + "grad_norm": 0.031153831630945206, + "learning_rate": 2.9606493857098657e-06, + "loss": 0.0003, + "step": 257 + }, + { + "epoch": 3.3947368421052633, + "grad_norm": 0.019811732694506645, + "learning_rate": 2.938947734918302e-06, + "loss": 0.0001, + "step": 258 + }, + { + "epoch": 3.4078947368421053, + "grad_norm": 0.029009979218244553, + "learning_rate": 2.9172119011607153e-06, + "loss": 0.0002, + "step": 259 + }, + { + "epoch": 3.4210526315789473, + "grad_norm": 0.08594539761543274, + "learning_rate": 2.8954435771107604e-06, + "loss": 0.0003, + "step": 260 + }, + { + "epoch": 3.4342105263157894, + "grad_norm": 0.07609947770833969, + "learning_rate": 2.8736444579722665e-06, + "loss": 0.0006, + "step": 261 + }, + { + "epoch": 3.4473684210526314, + "grad_norm": 0.052105486392974854, + "learning_rate": 2.8518162413472266e-06, + "loss": 0.0003, + "step": 262 + }, + { + "epoch": 3.4605263157894735, + "grad_norm": 0.023044012486934662, + "learning_rate": 2.8299606271035913e-06, + "loss": 0.0001, + "step": 263 + }, + { + "epoch": 3.473684210526316, + "grad_norm": 0.01714818924665451, + "learning_rate": 2.8080793172428965e-06, + "loss": 0.0001, + "step": 264 + }, + { + "epoch": 3.486842105263158, + "grad_norm": 0.024353889748454094, + "learning_rate": 2.786174015767721e-06, + "loss": 0.0002, + "step": 265 + }, + { + "epoch": 3.5, + "grad_norm": 0.044456806033849716, + "learning_rate": 2.764246428548983e-06, + "loss": 0.0004, + "step": 266 + }, + { + "epoch": 3.513157894736842, + "grad_norm": 0.06826099753379822, + "learning_rate": 2.742298263193099e-06, + "loss": 0.0005, + "step": 267 + }, + { + "epoch": 3.526315789473684, + "grad_norm": 0.2765248417854309, + "learning_rate": 2.720331228909005e-06, + "loss": 0.0005, + "step": 268 + }, + { + "epoch": 3.5394736842105265, + "grad_norm": 0.04589018225669861, + "learning_rate": 2.6983470363750497e-06, + "loss": 0.0004, + "step": 269 + }, + { + "epoch": 3.5526315789473686, + "grad_norm": 0.023166710510849953, + "learning_rate": 2.6763473976057776e-06, + "loss": 0.0001, + "step": 270 + }, + { + "epoch": 3.5657894736842106, + "grad_norm": 0.03657109662890434, + "learning_rate": 2.6543340258186063e-06, + "loss": 0.0002, + "step": 271 + }, + { + "epoch": 3.5789473684210527, + "grad_norm": 0.08881364017724991, + "learning_rate": 2.6323086353004077e-06, + "loss": 0.0004, + "step": 272 + }, + { + "epoch": 3.5921052631578947, + "grad_norm": 0.022605225443840027, + "learning_rate": 2.610272941274012e-06, + "loss": 0.0001, + "step": 273 + }, + { + "epoch": 3.6052631578947367, + "grad_norm": 0.05161530151963234, + "learning_rate": 2.588228659764632e-06, + "loss": 0.0002, + "step": 274 + }, + { + "epoch": 3.6184210526315788, + "grad_norm": 0.039631813764572144, + "learning_rate": 2.5661775074662276e-06, + "loss": 0.0003, + "step": 275 + }, + { + "epoch": 3.6315789473684212, + "grad_norm": 0.03075975738465786, + "learning_rate": 2.544121201607822e-06, + "loss": 0.0001, + "step": 276 + }, + { + "epoch": 3.6447368421052633, + "grad_norm": 0.04068103805184364, + "learning_rate": 2.5220614598197708e-06, + "loss": 0.0001, + "step": 277 + }, + { + "epoch": 3.6578947368421053, + "grad_norm": 0.08374299108982086, + "learning_rate": 2.5e-06, + "loss": 0.0006, + "step": 278 + }, + { + "epoch": 3.6710526315789473, + "grad_norm": 0.014881132170557976, + "learning_rate": 2.477938540180231e-06, + "loss": 0.0001, + "step": 279 + }, + { + "epoch": 3.6842105263157894, + "grad_norm": 0.08170844614505768, + "learning_rate": 2.455878798392179e-06, + "loss": 0.0004, + "step": 280 + }, + { + "epoch": 3.6973684210526314, + "grad_norm": 0.02605108916759491, + "learning_rate": 2.433822492533774e-06, + "loss": 0.0001, + "step": 281 + }, + { + "epoch": 3.7105263157894735, + "grad_norm": 0.056749701499938965, + "learning_rate": 2.411771340235369e-06, + "loss": 0.0008, + "step": 282 + }, + { + "epoch": 3.723684210526316, + "grad_norm": 0.043280281126499176, + "learning_rate": 2.389727058725989e-06, + "loss": 0.0002, + "step": 283 + }, + { + "epoch": 3.736842105263158, + "grad_norm": 0.01662975363433361, + "learning_rate": 2.3676913646995923e-06, + "loss": 0.0001, + "step": 284 + }, + { + "epoch": 3.75, + "grad_norm": 0.045024238526821136, + "learning_rate": 2.3456659741813945e-06, + "loss": 0.0001, + "step": 285 + }, + { + "epoch": 3.763157894736842, + "grad_norm": 0.02116972580552101, + "learning_rate": 2.3236526023942224e-06, + "loss": 0.0001, + "step": 286 + }, + { + "epoch": 3.776315789473684, + "grad_norm": 0.028999928385019302, + "learning_rate": 2.301652963624951e-06, + "loss": 0.0001, + "step": 287 + }, + { + "epoch": 3.7894736842105265, + "grad_norm": 0.05580444633960724, + "learning_rate": 2.2796687710909966e-06, + "loss": 0.0006, + "step": 288 + }, + { + "epoch": 3.8026315789473686, + "grad_norm": 0.03946217522025108, + "learning_rate": 2.2577017368069017e-06, + "loss": 0.0002, + "step": 289 + }, + { + "epoch": 3.8157894736842106, + "grad_norm": 0.01824789121747017, + "learning_rate": 2.235753571451018e-06, + "loss": 0.0001, + "step": 290 + }, + { + "epoch": 3.8289473684210527, + "grad_norm": 0.09996017813682556, + "learning_rate": 2.2138259842322794e-06, + "loss": 0.0002, + "step": 291 + }, + { + "epoch": 3.8421052631578947, + "grad_norm": 0.04970015585422516, + "learning_rate": 2.191920682757104e-06, + "loss": 0.0002, + "step": 292 + }, + { + "epoch": 3.8552631578947367, + "grad_norm": 0.05343327671289444, + "learning_rate": 2.170039372896409e-06, + "loss": 0.0003, + "step": 293 + }, + { + "epoch": 3.8684210526315788, + "grad_norm": 0.007754841353744268, + "learning_rate": 2.148183758652774e-06, + "loss": 0.0, + "step": 294 + }, + { + "epoch": 3.8815789473684212, + "grad_norm": 0.06841913610696793, + "learning_rate": 2.126355542027734e-06, + "loss": 0.0002, + "step": 295 + }, + { + "epoch": 3.8947368421052633, + "grad_norm": 0.0050100889056921005, + "learning_rate": 2.1045564228892404e-06, + "loss": 0.0, + "step": 296 + }, + { + "epoch": 3.9078947368421053, + "grad_norm": 0.2890152931213379, + "learning_rate": 2.0827880988392856e-06, + "loss": 0.0001, + "step": 297 + }, + { + "epoch": 3.9210526315789473, + "grad_norm": 0.010258257389068604, + "learning_rate": 2.0610522650816985e-06, + "loss": 0.0, + "step": 298 + }, + { + "epoch": 3.9342105263157894, + "grad_norm": 0.031965699046850204, + "learning_rate": 2.0393506142901347e-06, + "loss": 0.0001, + "step": 299 + }, + { + "epoch": 3.9473684210526314, + "grad_norm": 0.00890852976590395, + "learning_rate": 2.017684836476258e-06, + "loss": 0.0, + "step": 300 + }, + { + "epoch": 3.9605263157894735, + "grad_norm": 0.030713632702827454, + "learning_rate": 1.9960566188581306e-06, + "loss": 0.0001, + "step": 301 + }, + { + "epoch": 3.973684210526316, + "grad_norm": 0.010088774375617504, + "learning_rate": 1.9744676457288225e-06, + "loss": 0.0, + "step": 302 + }, + { + "epoch": 3.986842105263158, + "grad_norm": 0.006950493901968002, + "learning_rate": 1.952919598325247e-06, + "loss": 0.0, + "step": 303 + }, + { + "epoch": 4.0, + "grad_norm": 0.03781810402870178, + "learning_rate": 1.9314141546972345e-06, + "loss": 0.0003, + "step": 304 + }, + { + "epoch": 4.0131578947368425, + "grad_norm": 0.028575751930475235, + "learning_rate": 1.9099529895768552e-06, + "loss": 0.0002, + "step": 305 + }, + { + "epoch": 4.026315789473684, + "grad_norm": 0.007214389741420746, + "learning_rate": 1.8885377742480005e-06, + "loss": 0.0, + "step": 306 + }, + { + "epoch": 4.0394736842105265, + "grad_norm": 0.021563267335295677, + "learning_rate": 1.8671701764162287e-06, + "loss": 0.0001, + "step": 307 + }, + { + "epoch": 4.052631578947368, + "grad_norm": 0.014376094564795494, + "learning_rate": 1.8458518600788988e-06, + "loss": 0.0001, + "step": 308 + }, + { + "epoch": 4.065789473684211, + "grad_norm": 0.010882866568863392, + "learning_rate": 1.8245844853955786e-06, + "loss": 0.0, + "step": 309 + }, + { + "epoch": 4.078947368421052, + "grad_norm": 0.007655533961951733, + "learning_rate": 1.8033697085587698e-06, + "loss": 0.0, + "step": 310 + }, + { + "epoch": 4.092105263157895, + "grad_norm": 0.004504406359046698, + "learning_rate": 1.782209181664924e-06, + "loss": 0.0, + "step": 311 + }, + { + "epoch": 4.105263157894737, + "grad_norm": 0.00908347126096487, + "learning_rate": 1.7611045525857902e-06, + "loss": 0.0, + "step": 312 + }, + { + "epoch": 4.118421052631579, + "grad_norm": 0.012282164767384529, + "learning_rate": 1.740057464840088e-06, + "loss": 0.0, + "step": 313 + }, + { + "epoch": 4.131578947368421, + "grad_norm": 0.006549667567014694, + "learning_rate": 1.7190695574655147e-06, + "loss": 0.0, + "step": 314 + }, + { + "epoch": 4.144736842105263, + "grad_norm": 0.025752369314432144, + "learning_rate": 1.6981424648911112e-06, + "loss": 0.0001, + "step": 315 + }, + { + "epoch": 4.157894736842105, + "grad_norm": 0.0477922260761261, + "learning_rate": 1.677277816809975e-06, + "loss": 0.0002, + "step": 316 + }, + { + "epoch": 4.171052631578948, + "grad_norm": 0.026114268228411674, + "learning_rate": 1.6564772380523546e-06, + "loss": 0.0001, + "step": 317 + }, + { + "epoch": 4.184210526315789, + "grad_norm": 0.02332746423780918, + "learning_rate": 1.635742348459109e-06, + "loss": 0.0001, + "step": 318 + }, + { + "epoch": 4.197368421052632, + "grad_norm": 0.008722717873752117, + "learning_rate": 1.6150747627555713e-06, + "loss": 0.0, + "step": 319 + }, + { + "epoch": 4.2105263157894735, + "grad_norm": 0.004474959336221218, + "learning_rate": 1.5944760904257944e-06, + "loss": 0.0, + "step": 320 + }, + { + "epoch": 4.223684210526316, + "grad_norm": 0.0274661872535944, + "learning_rate": 1.5739479355872162e-06, + "loss": 0.0001, + "step": 321 + }, + { + "epoch": 4.2368421052631575, + "grad_norm": 0.008118880912661552, + "learning_rate": 1.5534918968657423e-06, + "loss": 0.0, + "step": 322 + }, + { + "epoch": 4.25, + "grad_norm": 0.011024386622011662, + "learning_rate": 1.5331095672712463e-06, + "loss": 0.0001, + "step": 323 + }, + { + "epoch": 4.2631578947368425, + "grad_norm": 0.00726787094026804, + "learning_rate": 1.5128025340735223e-06, + "loss": 0.0, + "step": 324 + }, + { + "epoch": 4.276315789473684, + "grad_norm": 0.003688193392008543, + "learning_rate": 1.4925723786786691e-06, + "loss": 0.0, + "step": 325 + }, + { + "epoch": 4.2894736842105265, + "grad_norm": 0.045026637613773346, + "learning_rate": 1.4724206765059456e-06, + "loss": 0.0002, + "step": 326 + }, + { + "epoch": 4.302631578947368, + "grad_norm": 0.004103303886950016, + "learning_rate": 1.4523489968650795e-06, + "loss": 0.0, + "step": 327 + }, + { + "epoch": 4.315789473684211, + "grad_norm": 0.0039488510228693485, + "learning_rate": 1.4323589028340598e-06, + "loss": 0.0, + "step": 328 + }, + { + "epoch": 4.328947368421053, + "grad_norm": 0.004720236640423536, + "learning_rate": 1.4124519511374158e-06, + "loss": 0.0, + "step": 329 + }, + { + "epoch": 4.342105263157895, + "grad_norm": 0.004932956770062447, + "learning_rate": 1.3926296920249796e-06, + "loss": 0.0, + "step": 330 + }, + { + "epoch": 4.355263157894737, + "grad_norm": 0.017614832147955894, + "learning_rate": 1.3728936691511704e-06, + "loss": 0.0001, + "step": 331 + }, + { + "epoch": 4.368421052631579, + "grad_norm": 0.004803107585757971, + "learning_rate": 1.3532454194547734e-06, + "loss": 0.0, + "step": 332 + }, + { + "epoch": 4.381578947368421, + "grad_norm": 0.028480617329478264, + "learning_rate": 1.3336864730392587e-06, + "loss": 0.0001, + "step": 333 + }, + { + "epoch": 4.394736842105263, + "grad_norm": 0.0031023717019706964, + "learning_rate": 1.314218353053619e-06, + "loss": 0.0, + "step": 334 + }, + { + "epoch": 4.407894736842105, + "grad_norm": 0.008832458406686783, + "learning_rate": 1.2948425755737592e-06, + "loss": 0.0, + "step": 335 + }, + { + "epoch": 4.421052631578947, + "grad_norm": 0.005478347185999155, + "learning_rate": 1.2755606494844294e-06, + "loss": 0.0, + "step": 336 + }, + { + "epoch": 4.434210526315789, + "grad_norm": 0.012723652645945549, + "learning_rate": 1.2563740763617198e-06, + "loss": 0.0, + "step": 337 + }, + { + "epoch": 4.447368421052632, + "grad_norm": 0.007557735778391361, + "learning_rate": 1.2372843503561318e-06, + "loss": 0.0, + "step": 338 + }, + { + "epoch": 4.4605263157894735, + "grad_norm": 0.02057843655347824, + "learning_rate": 1.218292958076213e-06, + "loss": 0.0001, + "step": 339 + }, + { + "epoch": 4.473684210526316, + "grad_norm": 0.006118930876255035, + "learning_rate": 1.1994013784727948e-06, + "loss": 0.0, + "step": 340 + }, + { + "epoch": 4.4868421052631575, + "grad_norm": 0.0338875874876976, + "learning_rate": 1.180611082723814e-06, + "loss": 0.0, + "step": 341 + }, + { + "epoch": 4.5, + "grad_norm": 0.01357252337038517, + "learning_rate": 1.161923534119752e-06, + "loss": 0.0001, + "step": 342 + }, + { + "epoch": 4.5131578947368425, + "grad_norm": 0.014216500334441662, + "learning_rate": 1.1433401879496723e-06, + "loss": 0.0001, + "step": 343 + }, + { + "epoch": 4.526315789473684, + "grad_norm": 0.004121949430555105, + "learning_rate": 1.1248624913878966e-06, + "loss": 0.0, + "step": 344 + }, + { + "epoch": 4.5394736842105265, + "grad_norm": 0.005702656228095293, + "learning_rate": 1.1064918833813073e-06, + "loss": 0.0, + "step": 345 + }, + { + "epoch": 4.552631578947368, + "grad_norm": 0.0023520493414252996, + "learning_rate": 1.088229794537283e-06, + "loss": 0.0, + "step": 346 + }, + { + "epoch": 4.565789473684211, + "grad_norm": 0.0023677553981542587, + "learning_rate": 1.0700776470122981e-06, + "loss": 0.0, + "step": 347 + }, + { + "epoch": 4.578947368421053, + "grad_norm": 0.01764129102230072, + "learning_rate": 1.0520368544011661e-06, + "loss": 0.0001, + "step": 348 + }, + { + "epoch": 4.592105263157895, + "grad_norm": 0.008476192131638527, + "learning_rate": 1.0341088216269625e-06, + "loss": 0.0, + "step": 349 + }, + { + "epoch": 4.605263157894737, + "grad_norm": 0.0017920136451721191, + "learning_rate": 1.0162949448316089e-06, + "loss": 0.0, + "step": 350 + }, + { + "epoch": 4.618421052631579, + "grad_norm": 0.0061067817732691765, + "learning_rate": 9.98596611267158e-07, + "loss": 0.0, + "step": 351 + }, + { + "epoch": 4.631578947368421, + "grad_norm": 0.004206137731671333, + "learning_rate": 9.81015199187753e-07, + "loss": 0.0, + "step": 352 + }, + { + "epoch": 4.644736842105263, + "grad_norm": 0.02187529392540455, + "learning_rate": 9.63552077742301e-07, + "loss": 0.0, + "step": 353 + }, + { + "epoch": 4.657894736842105, + "grad_norm": 0.0074692717753350735, + "learning_rate": 9.462086068678519e-07, + "loss": 0.0, + "step": 354 + }, + { + "epoch": 4.671052631578947, + "grad_norm": 0.004613088443875313, + "learning_rate": 9.289861371836886e-07, + "loss": 0.0, + "step": 355 + }, + { + "epoch": 4.684210526315789, + "grad_norm": 0.0051763323135674, + "learning_rate": 9.118860098861538e-07, + "loss": 0.0, + "step": 356 + }, + { + "epoch": 4.697368421052632, + "grad_norm": 0.004479160998016596, + "learning_rate": 8.949095566441985e-07, + "loss": 0.0, + "step": 357 + }, + { + "epoch": 4.7105263157894735, + "grad_norm": 0.0031334473751485348, + "learning_rate": 8.78058099495685e-07, + "loss": 0.0, + "step": 358 + }, + { + "epoch": 4.723684210526316, + "grad_norm": 0.005216387566179037, + "learning_rate": 8.613329507444274e-07, + "loss": 0.0, + "step": 359 + }, + { + "epoch": 4.7368421052631575, + "grad_norm": 0.0030827040318399668, + "learning_rate": 8.44735412857999e-07, + "loss": 0.0, + "step": 360 + }, + { + "epoch": 4.75, + "grad_norm": 0.008658899925649166, + "learning_rate": 8.282667783663056e-07, + "loss": 0.0, + "step": 361 + }, + { + "epoch": 4.7631578947368425, + "grad_norm": 0.00587991438806057, + "learning_rate": 8.119283297609238e-07, + "loss": 0.0, + "step": 362 + }, + { + "epoch": 4.776315789473684, + "grad_norm": 0.01003989763557911, + "learning_rate": 7.957213393952335e-07, + "loss": 0.0, + "step": 363 + }, + { + "epoch": 4.7894736842105265, + "grad_norm": 0.005186410155147314, + "learning_rate": 7.796470693853281e-07, + "loss": 0.0, + "step": 364 + }, + { + "epoch": 4.802631578947368, + "grad_norm": 0.0056229582987725735, + "learning_rate": 7.637067715117327e-07, + "loss": 0.0, + "step": 365 + }, + { + "epoch": 4.815789473684211, + "grad_norm": 0.002910787472501397, + "learning_rate": 7.479016871219174e-07, + "loss": 0.0, + "step": 366 + }, + { + "epoch": 4.828947368421053, + "grad_norm": 0.025458911433815956, + "learning_rate": 7.322330470336314e-07, + "loss": 0.0001, + "step": 367 + }, + { + "epoch": 4.842105263157895, + "grad_norm": 0.008555232547223568, + "learning_rate": 7.167020714390502e-07, + "loss": 0.0, + "step": 368 + }, + { + "epoch": 4.855263157894737, + "grad_norm": 0.012796996161341667, + "learning_rate": 7.013099698097539e-07, + "loss": 0.0, + "step": 369 + }, + { + "epoch": 4.868421052631579, + "grad_norm": 0.008186944760382175, + "learning_rate": 6.860579408025436e-07, + "loss": 0.0, + "step": 370 + }, + { + "epoch": 4.881578947368421, + "grad_norm": 0.008431337773799896, + "learning_rate": 6.709471721660904e-07, + "loss": 0.0, + "step": 371 + }, + { + "epoch": 4.894736842105263, + "grad_norm": 0.021517649292945862, + "learning_rate": 6.559788406484446e-07, + "loss": 0.0001, + "step": 372 + }, + { + "epoch": 4.907894736842105, + "grad_norm": 0.008807409554719925, + "learning_rate": 6.41154111905393e-07, + "loss": 0.0, + "step": 373 + }, + { + "epoch": 4.921052631578947, + "grad_norm": 0.028927143663167953, + "learning_rate": 6.264741404096875e-07, + "loss": 0.0001, + "step": 374 + }, + { + "epoch": 4.934210526315789, + "grad_norm": 0.008634347468614578, + "learning_rate": 6.119400693611358e-07, + "loss": 0.0, + "step": 375 + }, + { + "epoch": 4.947368421052632, + "grad_norm": 0.0028107326943427324, + "learning_rate": 5.975530305975808e-07, + "loss": 0.0, + "step": 376 + }, + { + "epoch": 4.9605263157894735, + "grad_norm": 0.0016718370607122779, + "learning_rate": 5.833141445067541e-07, + "loss": 0.0, + "step": 377 + }, + { + "epoch": 4.973684210526316, + "grad_norm": 0.007590409368276596, + "learning_rate": 5.692245199390281e-07, + "loss": 0.0, + "step": 378 + }, + { + "epoch": 4.9868421052631575, + "grad_norm": 0.003609555773437023, + "learning_rate": 5.552852541210651e-07, + "loss": 0.0, + "step": 379 + }, + { + "epoch": 5.0, + "grad_norm": 0.03264299035072327, + "learning_rate": 5.414974325703687e-07, + "loss": 0.0001, + "step": 380 + } + ], + "logging_steps": 1, + "max_steps": 456, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 76, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.4446096401561027e+19, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-380/training_args.bin b/checkpoint-380/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..bd15c797edde13f0f3e0490d0aec249c013df912 --- /dev/null +++ b/checkpoint-380/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ebcaf257fe89a74904f6ea50b526a559eb74a53ebc4dfb373932a4d0fa515f5 +size 7928 diff --git a/checkpoint-380/zero_to_fp32.py b/checkpoint-380/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/checkpoint-380/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/checkpoint-76/README.md b/checkpoint-76/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1b184114a0c28ed3e4c082c18486736dc818166d --- /dev/null +++ b/checkpoint-76/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Llama-3.3-70B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.0 \ No newline at end of file diff --git a/checkpoint-76/adapter_config.json b/checkpoint-76/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..763d98264e556bcbc60c63c5b9f70b53c7bbe722 --- /dev/null +++ b/checkpoint-76/adapter_config.json @@ -0,0 +1,42 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.3-70B-Instruct", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 512, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [ + "embed_tokens", + "lm_head" + ], + "peft_type": "LORA", + "r": 256, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "gate_proj", + "up_proj", + "k_proj", + "q_proj", + "down_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-76/adapter_model.safetensors b/checkpoint-76/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..db58bd213b3090ac6c620e1a62b8c7a7d6ffe43e --- /dev/null +++ b/checkpoint-76/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0adf11ea8989f8925fd2a1c5ea841eec2675aef8b0edf99afc159d34fbdf512d +size 10829849744 diff --git a/checkpoint-76/global_step76/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-76/global_step76/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9284b074cc9938abb3904f0f98bca8fd9c20172d --- /dev/null +++ b/checkpoint-76/global_step76/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a24f96602c436bd3643e2ebd7bb35792c78d154ea76be6b1ec0cf0434aac0a4 +size 21659418140 diff --git a/checkpoint-76/global_step76/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-76/global_step76/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..454bcb1e198a56ccfa782c501379e94e0c1048a0 --- /dev/null +++ b/checkpoint-76/global_step76/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:540de66bfffd0df8dba29a777ffca49e244a43e33fac83bdeb116c74ca6c603d +size 21659457372 diff --git a/checkpoint-76/global_step76/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-76/global_step76/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4831774b32b17b6fdb0e5300518c95390a15ce53 --- /dev/null +++ b/checkpoint-76/global_step76/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d9f8de718771f2b937e78986446656e9bc408f1b5aa039ab191dccee57efd50 +size 21659417820 diff --git a/checkpoint-76/global_step76/mp_rank_00_model_states.pt b/checkpoint-76/global_step76/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..79b7e8051b705f1e754c936c51339d343fdd39ae --- /dev/null +++ b/checkpoint-76/global_step76/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7353e33af25bbe1dc5f514aa7f92a6f391ca5f640fc22416b0551e0a03950405 +size 11918643933 diff --git a/checkpoint-76/latest b/checkpoint-76/latest new file mode 100644 index 0000000000000000000000000000000000000000..3137f19948d5aa563b9948e1161e2ee9665c4f33 --- /dev/null +++ b/checkpoint-76/latest @@ -0,0 +1 @@ +global_step76 \ No newline at end of file diff --git a/checkpoint-76/rng_state_0.pth b/checkpoint-76/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..f0243a35107f9351684b4b491b92475dc82efd3d --- /dev/null +++ b/checkpoint-76/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4caef4c46f6d195a03b7ac9efc175ac61a61d226f3ba835e0fb9bac39e6bc64 +size 14768 diff --git a/checkpoint-76/rng_state_1.pth b/checkpoint-76/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..81018554b8d83534e404729a34be576aba370f9e --- /dev/null +++ b/checkpoint-76/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33674d0cb90389e7c73070665b3a44c29b2a2e3d5ae9dd280aeddf03fcad3db6 +size 14768 diff --git a/checkpoint-76/rng_state_2.pth b/checkpoint-76/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..cb9fddced9c2ef113e1fe83cb8d35aae87b9c46a --- /dev/null +++ b/checkpoint-76/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f86d3f237c14ca71c248f48fd103d5f3f60e3d6f92df22f71396c5e09ff918ae +size 14768 diff --git a/checkpoint-76/scheduler.pt b/checkpoint-76/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f7c7c864b80bf300100f0f6e4794b1332e6b06fe --- /dev/null +++ b/checkpoint-76/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c315fe8040ff17806bbc85e0852bb90368308daf9b2756cfe9934a4972441ebd +size 1064 diff --git a/checkpoint-76/special_tokens_map.json b/checkpoint-76/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..278b7f0f84be865c4687700ee7b3c63d89a51e18 --- /dev/null +++ b/checkpoint-76/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-76/tokenizer.json b/checkpoint-76/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-76/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-76/tokenizer_config.json b/checkpoint-76/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ca91a2ef55f4239a7af81d7c9abb05f53621a07b --- /dev/null +++ b/checkpoint-76/tokenizer_config.json @@ -0,0 +1,2064 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|end_of_text|>", + "tokenizer_class": "PreTrainedTokenizer" +} diff --git a/checkpoint-76/trainer_state.json b/checkpoint-76/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..fb58c593aa737ea599c5058db5a382067a9ba983 --- /dev/null +++ b/checkpoint-76/trainer_state.json @@ -0,0 +1,565 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 76, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.013157894736842105, + "grad_norm": 37.79440689086914, + "learning_rate": 5.0000000000000004e-08, + "loss": 3.1402, + "step": 1 + }, + { + "epoch": 0.02631578947368421, + "grad_norm": 38.45823287963867, + "learning_rate": 1.0000000000000001e-07, + "loss": 3.1787, + "step": 2 + }, + { + "epoch": 0.039473684210526314, + "grad_norm": 38.25625228881836, + "learning_rate": 1.5000000000000002e-07, + "loss": 3.1316, + "step": 3 + }, + { + "epoch": 0.05263157894736842, + "grad_norm": 37.2024040222168, + "learning_rate": 2.0000000000000002e-07, + "loss": 3.1011, + "step": 4 + }, + { + "epoch": 0.06578947368421052, + "grad_norm": 38.17294692993164, + "learning_rate": 2.5000000000000004e-07, + "loss": 3.133, + "step": 5 + }, + { + "epoch": 0.07894736842105263, + "grad_norm": 37.374794006347656, + "learning_rate": 3.0000000000000004e-07, + "loss": 3.0731, + "step": 6 + }, + { + "epoch": 0.09210526315789473, + "grad_norm": 37.226966857910156, + "learning_rate": 3.5000000000000004e-07, + "loss": 3.069, + "step": 7 + }, + { + "epoch": 0.10526315789473684, + "grad_norm": 38.40094757080078, + "learning_rate": 4.0000000000000003e-07, + "loss": 3.1223, + "step": 8 + }, + { + "epoch": 0.11842105263157894, + "grad_norm": 37.86320877075195, + "learning_rate": 4.5000000000000003e-07, + "loss": 3.062, + "step": 9 + }, + { + "epoch": 0.13157894736842105, + "grad_norm": 38.02171325683594, + "learning_rate": 5.000000000000001e-07, + "loss": 3.0008, + "step": 10 + }, + { + "epoch": 0.14473684210526316, + "grad_norm": 38.5522346496582, + "learning_rate": 5.5e-07, + "loss": 3.0047, + "step": 11 + }, + { + "epoch": 0.15789473684210525, + "grad_norm": 37.72829818725586, + "learning_rate": 6.000000000000001e-07, + "loss": 2.9274, + "step": 12 + }, + { + "epoch": 0.17105263157894737, + "grad_norm": 38.488494873046875, + "learning_rate": 6.5e-07, + "loss": 2.8727, + "step": 13 + }, + { + "epoch": 0.18421052631578946, + "grad_norm": 38.87471389770508, + "learning_rate": 7.000000000000001e-07, + "loss": 2.8422, + "step": 14 + }, + { + "epoch": 0.19736842105263158, + "grad_norm": 37.584896087646484, + "learning_rate": 7.5e-07, + "loss": 2.6728, + "step": 15 + }, + { + "epoch": 0.21052631578947367, + "grad_norm": 37.04607391357422, + "learning_rate": 8.000000000000001e-07, + "loss": 2.5215, + "step": 16 + }, + { + "epoch": 0.2236842105263158, + "grad_norm": 37.30121994018555, + "learning_rate": 8.500000000000001e-07, + "loss": 2.4689, + "step": 17 + }, + { + "epoch": 0.23684210526315788, + "grad_norm": 35.99961853027344, + "learning_rate": 9.000000000000001e-07, + "loss": 2.3, + "step": 18 + }, + { + "epoch": 0.25, + "grad_norm": 35.817543029785156, + "learning_rate": 9.500000000000001e-07, + "loss": 2.1423, + "step": 19 + }, + { + "epoch": 0.2631578947368421, + "grad_norm": 35.056915283203125, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.9639, + "step": 20 + }, + { + "epoch": 0.27631578947368424, + "grad_norm": 34.83850860595703, + "learning_rate": 1.0500000000000001e-06, + "loss": 1.7845, + "step": 21 + }, + { + "epoch": 0.2894736842105263, + "grad_norm": 34.32366943359375, + "learning_rate": 1.1e-06, + "loss": 1.5864, + "step": 22 + }, + { + "epoch": 0.3026315789473684, + "grad_norm": 33.79611587524414, + "learning_rate": 1.1500000000000002e-06, + "loss": 1.4011, + "step": 23 + }, + { + "epoch": 0.3157894736842105, + "grad_norm": 32.596031188964844, + "learning_rate": 1.2000000000000002e-06, + "loss": 1.195, + "step": 24 + }, + { + "epoch": 0.32894736842105265, + "grad_norm": 30.045007705688477, + "learning_rate": 1.25e-06, + "loss": 0.9883, + "step": 25 + }, + { + "epoch": 0.34210526315789475, + "grad_norm": 24.89093589782715, + "learning_rate": 1.3e-06, + "loss": 0.7669, + "step": 26 + }, + { + "epoch": 0.35526315789473684, + "grad_norm": 23.454408645629883, + "learning_rate": 1.3500000000000002e-06, + "loss": 0.6304, + "step": 27 + }, + { + "epoch": 0.3684210526315789, + "grad_norm": 19.837312698364258, + "learning_rate": 1.4000000000000001e-06, + "loss": 0.4717, + "step": 28 + }, + { + "epoch": 0.3815789473684211, + "grad_norm": 15.185093879699707, + "learning_rate": 1.45e-06, + "loss": 0.363, + "step": 29 + }, + { + "epoch": 0.39473684210526316, + "grad_norm": 9.057796478271484, + "learning_rate": 1.5e-06, + "loss": 0.2439, + "step": 30 + }, + { + "epoch": 0.40789473684210525, + "grad_norm": 5.976982593536377, + "learning_rate": 1.5500000000000002e-06, + "loss": 0.1864, + "step": 31 + }, + { + "epoch": 0.42105263157894735, + "grad_norm": 3.067375421524048, + "learning_rate": 1.6000000000000001e-06, + "loss": 0.1134, + "step": 32 + }, + { + "epoch": 0.4342105263157895, + "grad_norm": 2.3589119911193848, + "learning_rate": 1.6500000000000003e-06, + "loss": 0.0985, + "step": 33 + }, + { + "epoch": 0.4473684210526316, + "grad_norm": 2.0044353008270264, + "learning_rate": 1.7000000000000002e-06, + "loss": 0.0859, + "step": 34 + }, + { + "epoch": 0.4605263157894737, + "grad_norm": 1.4279972314834595, + "learning_rate": 1.75e-06, + "loss": 0.0728, + "step": 35 + }, + { + "epoch": 0.47368421052631576, + "grad_norm": 0.9807674288749695, + "learning_rate": 1.8000000000000001e-06, + "loss": 0.061, + "step": 36 + }, + { + "epoch": 0.4868421052631579, + "grad_norm": 0.906160295009613, + "learning_rate": 1.85e-06, + "loss": 0.0676, + "step": 37 + }, + { + "epoch": 0.5, + "grad_norm": 0.8837690353393555, + "learning_rate": 1.9000000000000002e-06, + "loss": 0.0622, + "step": 38 + }, + { + "epoch": 0.5131578947368421, + "grad_norm": 0.9579435586929321, + "learning_rate": 1.9500000000000004e-06, + "loss": 0.0557, + "step": 39 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 0.8149510622024536, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.0555, + "step": 40 + }, + { + "epoch": 0.5394736842105263, + "grad_norm": 0.8899760246276855, + "learning_rate": 2.05e-06, + "loss": 0.0517, + "step": 41 + }, + { + "epoch": 0.5526315789473685, + "grad_norm": 0.6007645130157471, + "learning_rate": 2.1000000000000002e-06, + "loss": 0.0518, + "step": 42 + }, + { + "epoch": 0.5657894736842105, + "grad_norm": 0.48819127678871155, + "learning_rate": 2.15e-06, + "loss": 0.0429, + "step": 43 + }, + { + "epoch": 0.5789473684210527, + "grad_norm": 0.42939358949661255, + "learning_rate": 2.2e-06, + "loss": 0.0459, + "step": 44 + }, + { + "epoch": 0.5921052631578947, + "grad_norm": 0.5706579685211182, + "learning_rate": 2.25e-06, + "loss": 0.0453, + "step": 45 + }, + { + "epoch": 0.6052631578947368, + "grad_norm": 0.3034597337245941, + "learning_rate": 2.3000000000000004e-06, + "loss": 0.0421, + "step": 46 + }, + { + "epoch": 0.618421052631579, + "grad_norm": 0.5601783394813538, + "learning_rate": 2.35e-06, + "loss": 0.0411, + "step": 47 + }, + { + "epoch": 0.631578947368421, + "grad_norm": 0.35388317704200745, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.04, + "step": 48 + }, + { + "epoch": 0.6447368421052632, + "grad_norm": 0.48609891533851624, + "learning_rate": 2.4500000000000003e-06, + "loss": 0.04, + "step": 49 + }, + { + "epoch": 0.6578947368421053, + "grad_norm": 0.4638507068157196, + "learning_rate": 2.5e-06, + "loss": 0.0369, + "step": 50 + }, + { + "epoch": 0.6710526315789473, + "grad_norm": 0.5685771703720093, + "learning_rate": 2.55e-06, + "loss": 0.0428, + "step": 51 + }, + { + "epoch": 0.6842105263157895, + "grad_norm": 0.46358174085617065, + "learning_rate": 2.6e-06, + "loss": 0.0483, + "step": 52 + }, + { + "epoch": 0.6973684210526315, + "grad_norm": 0.35054436326026917, + "learning_rate": 2.6500000000000005e-06, + "loss": 0.0391, + "step": 53 + }, + { + "epoch": 0.7105263157894737, + "grad_norm": 0.3350559175014496, + "learning_rate": 2.7000000000000004e-06, + "loss": 0.039, + "step": 54 + }, + { + "epoch": 0.7236842105263158, + "grad_norm": 0.2875112295150757, + "learning_rate": 2.7500000000000004e-06, + "loss": 0.0383, + "step": 55 + }, + { + "epoch": 0.7368421052631579, + "grad_norm": 0.4492928683757782, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.0358, + "step": 56 + }, + { + "epoch": 0.75, + "grad_norm": 0.29484888911247253, + "learning_rate": 2.85e-06, + "loss": 0.0355, + "step": 57 + }, + { + "epoch": 0.7631578947368421, + "grad_norm": 0.36551928520202637, + "learning_rate": 2.9e-06, + "loss": 0.0403, + "step": 58 + }, + { + "epoch": 0.7763157894736842, + "grad_norm": 0.4458053708076477, + "learning_rate": 2.95e-06, + "loss": 0.0342, + "step": 59 + }, + { + "epoch": 0.7894736842105263, + "grad_norm": 0.34047460556030273, + "learning_rate": 3e-06, + "loss": 0.0302, + "step": 60 + }, + { + "epoch": 0.8026315789473685, + "grad_norm": 0.3420606255531311, + "learning_rate": 3.05e-06, + "loss": 0.034, + "step": 61 + }, + { + "epoch": 0.8157894736842105, + "grad_norm": 0.3902851939201355, + "learning_rate": 3.1000000000000004e-06, + "loss": 0.0327, + "step": 62 + }, + { + "epoch": 0.8289473684210527, + "grad_norm": 0.29165828227996826, + "learning_rate": 3.1500000000000003e-06, + "loss": 0.0341, + "step": 63 + }, + { + "epoch": 0.8421052631578947, + "grad_norm": 0.40872958302497864, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.035, + "step": 64 + }, + { + "epoch": 0.8552631578947368, + "grad_norm": 0.36295783519744873, + "learning_rate": 3.2500000000000002e-06, + "loss": 0.0323, + "step": 65 + }, + { + "epoch": 0.868421052631579, + "grad_norm": 0.3857724368572235, + "learning_rate": 3.3000000000000006e-06, + "loss": 0.0336, + "step": 66 + }, + { + "epoch": 0.881578947368421, + "grad_norm": 0.3207017481327057, + "learning_rate": 3.3500000000000005e-06, + "loss": 0.0332, + "step": 67 + }, + { + "epoch": 0.8947368421052632, + "grad_norm": 0.2903987169265747, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.0327, + "step": 68 + }, + { + "epoch": 0.9078947368421053, + "grad_norm": 0.3386954963207245, + "learning_rate": 3.45e-06, + "loss": 0.0308, + "step": 69 + }, + { + "epoch": 0.9210526315789473, + "grad_norm": 0.4339621365070343, + "learning_rate": 3.5e-06, + "loss": 0.0361, + "step": 70 + }, + { + "epoch": 0.9342105263157895, + "grad_norm": 0.28095564246177673, + "learning_rate": 3.5500000000000003e-06, + "loss": 0.0306, + "step": 71 + }, + { + "epoch": 0.9473684210526315, + "grad_norm": 0.4141469895839691, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.028, + "step": 72 + }, + { + "epoch": 0.9605263157894737, + "grad_norm": 0.35212820768356323, + "learning_rate": 3.65e-06, + "loss": 0.032, + "step": 73 + }, + { + "epoch": 0.9736842105263158, + "grad_norm": 0.26956063508987427, + "learning_rate": 3.7e-06, + "loss": 0.0294, + "step": 74 + }, + { + "epoch": 0.9868421052631579, + "grad_norm": 0.32735681533813477, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.0272, + "step": 75 + }, + { + "epoch": 1.0, + "grad_norm": 0.4906782805919647, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.0324, + "step": 76 + } + ], + "logging_steps": 1, + "max_steps": 456, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 76, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.889219280312205e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-76/training_args.bin b/checkpoint-76/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..bd15c797edde13f0f3e0490d0aec249c013df912 --- /dev/null +++ b/checkpoint-76/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ebcaf257fe89a74904f6ea50b526a559eb74a53ebc4dfb373932a4d0fa515f5 +size 7928 diff --git a/checkpoint-76/zero_to_fp32.py b/checkpoint-76/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/checkpoint-76/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/config.json b/config.json new file mode 100644 index 0000000000000000000000000000000000000000..9d764aead97a826f7aae3b9cfcfe2606e1d2eeec --- /dev/null +++ b/config.json @@ -0,0 +1,52 @@ +{ + "_attn_implementation_autoset": true, + "_name_or_path": "meta-llama/Llama-3.3-70B-Instruct", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": 128009, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 8192, + "initializer_range": 0.02, + "intermediate_size": 28672, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 64, + "num_hidden_layers": 80, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "quantization_config": { + "_load_in_4bit": true, + "_load_in_8bit": false, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_storage": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "llm_int8_enable_fp32_cpu_offload": false, + "llm_int8_has_fp16_weight": false, + "llm_int8_skip_modules": null, + "llm_int8_threshold": 6.0, + "load_in_4bit": true, + "load_in_8bit": false, + "quant_method": "bitsandbytes" + }, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.49.0", + "use_cache": false, + "vocab_size": 128256 +} diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..278b7f0f84be865c4687700ee7b3c63d89a51e18 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ca91a2ef55f4239a7af81d7c9abb05f53621a07b --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,2064 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|end_of_text|>", + "tokenizer_class": "PreTrainedTokenizer" +}