diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..c5ab07a8ec731d13c6bb1c7ef39ee4a5b356a417 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +checkpoint-144/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-216/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-288/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-360/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-72/tokenizer.json filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/adapter_config.json b/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b280adcf5b1303ae3e5d7117b690cc14a4eaec2b --- /dev/null +++ b/adapter_config.json @@ -0,0 +1,42 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.1-70B-Instruct", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": null, + "inference_mode": false, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 512, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [ + "embed_tokens", + "lm_head" + ], + "peft_type": "LORA", + "r": 256, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "o_proj", + "v_proj", + "q_proj", + "up_proj", + "k_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-144/README.md b/checkpoint-144/README.md new file mode 100644 
index 0000000000000000000000000000000000000000..037e1a543b9c1891b5c6981f89d5b7c7c9a907ae --- /dev/null +++ b/checkpoint-144/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Llama-3.1-70B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.0 \ No newline at end of file diff --git a/checkpoint-144/adapter_config.json b/checkpoint-144/adapter_config.json new file mode 100644 index 
0000000000000000000000000000000000000000..3295690c20dff5dc1d1f30f8500f0efb7e255838 --- /dev/null +++ b/checkpoint-144/adapter_config.json @@ -0,0 +1,42 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.1-70B-Instruct", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 512, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [ + "embed_tokens", + "lm_head" + ], + "peft_type": "LORA", + "r": 256, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "down_proj", + "gate_proj", + "v_proj", + "up_proj", + "k_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-144/adapter_model.safetensors b/checkpoint-144/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3adda9eba7b361be3b649abcd467a4f29578c3c0 --- /dev/null +++ b/checkpoint-144/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6889a110b91372dfc2bb9d9f07f682f3ffcce8505520a3991d1df27d290beeb7 +size 10829849744 diff --git a/checkpoint-144/global_step143/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-144/global_step143/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..464fbd6b430bf2146b64a6fbf0b7fc704ddcfa93 --- /dev/null +++ b/checkpoint-144/global_step143/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:79dd1236d82bf8e6ec20246c52a7f901209b663da9e78f0635c5f0a85faa8f5e +size 21659418140 diff --git a/checkpoint-144/global_step143/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-144/global_step143/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..dc0011ef6b84e337e5b8a46d0873c53975bcdd38 --- /dev/null +++ b/checkpoint-144/global_step143/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01bf9014270a3e4333f8397a8f0fd9b8a18703081b92fb66f22de4f18b8425c6 +size 21659457372 diff --git a/checkpoint-144/global_step143/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-144/global_step143/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..15c1435f7c39142cc29206a50287dca4fb728a97 --- /dev/null +++ b/checkpoint-144/global_step143/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:249da40cad3ec1b2f1ff2d06a404bcc1aadb5530f31021b2494da08866f1b5ad +size 21659417820 diff --git a/checkpoint-144/global_step143/mp_rank_00_model_states.pt b/checkpoint-144/global_step143/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9103b4ea50dec996a583560189b544b38730b7a9 --- /dev/null +++ b/checkpoint-144/global_step143/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01664b3e5314aa90d6c422ecda42e07d4c878f2b6340eb3587f1e4eab1a4e4b4 +size 11918643933 diff --git a/checkpoint-144/latest b/checkpoint-144/latest new file mode 100644 index 0000000000000000000000000000000000000000..93407f5a9fdef065b428ddd4b9440e88eb65a982 --- /dev/null +++ b/checkpoint-144/latest @@ -0,0 +1 @@ +global_step143 \ No newline at end of file diff --git a/checkpoint-144/rng_state_0.pth b/checkpoint-144/rng_state_0.pth new file mode 100644 index 
0000000000000000000000000000000000000000..64b63e44f4cfdd29a0ce453bd6c6ce36968570bc --- /dev/null +++ b/checkpoint-144/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52cfa88e96395a9f554de4f79c4baba7be8d9940fe5b00d6c840fc070c9e0871 +size 14768 diff --git a/checkpoint-144/rng_state_1.pth b/checkpoint-144/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..d0c1d24cbdbe1b77b5ac7d8dfa649c9cc6d80415 --- /dev/null +++ b/checkpoint-144/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f675af0d887993afac3122873bcddaa14afdea8fb3ff46a0ea096b2acca2bc0f +size 14768 diff --git a/checkpoint-144/rng_state_2.pth b/checkpoint-144/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..9bae09d9af3d65c424cb5cf304472803673786f0 --- /dev/null +++ b/checkpoint-144/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f112bdcf15570488162c8646b9e1e3c10f4135c8f174cc0118c1172493350e4e +size 14768 diff --git a/checkpoint-144/scheduler.pt b/checkpoint-144/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..e2dde2b518bcd4fd63433ef1bc0e8e93df9dd450 --- /dev/null +++ b/checkpoint-144/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3c70fd0b9904cd462c44b91c3a6304d50f808b742c89f710f481983baa6497c +size 1064 diff --git a/checkpoint-144/special_tokens_map.json b/checkpoint-144/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..278b7f0f84be865c4687700ee7b3c63d89a51e18 --- /dev/null +++ b/checkpoint-144/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + 
"pad_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-144/tokenizer.json b/checkpoint-144/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-144/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-144/tokenizer_config.json b/checkpoint-144/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ca91a2ef55f4239a7af81d7c9abb05f53621a07b --- /dev/null +++ b/checkpoint-144/tokenizer_config.json @@ -0,0 +1,2064 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": 
"<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": 
"<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": 
"<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": 
"<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": 
"<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + 
"extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|end_of_text|>", + "tokenizer_class": "PreTrainedTokenizer" +} diff --git a/checkpoint-144/trainer_state.json b/checkpoint-144/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..add5f06ff0b6ae1ccd65e308cf14c4b3cb2d25e2 --- /dev/null +++ b/checkpoint-144/trainer_state.json @@ -0,0 +1,1041 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9815668202764978, + "eval_steps": 500, + "global_step": 144, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.013824884792626729, + "grad_norm": 31.00213623046875, + "learning_rate": 5.0000000000000004e-08, + "loss": 2.2089, + "step": 1 + }, + { + "epoch": 0.027649769585253458, + "grad_norm": 30.27136993408203, + "learning_rate": 1.0000000000000001e-07, + "loss": 2.1536, + "step": 2 + }, + { + "epoch": 0.041474654377880185, + "grad_norm": 30.48703384399414, + "learning_rate": 1.5000000000000002e-07, + "loss": 2.1581, + "step": 3 + }, + { + "epoch": 0.055299539170506916, + "grad_norm": 30.779329299926758, + "learning_rate": 2.0000000000000002e-07, + "loss": 2.1741, + "step": 4 + }, + { + "epoch": 0.06912442396313365, + "grad_norm": 31.22808837890625, + "learning_rate": 2.5000000000000004e-07, + "loss": 2.1864, + "step": 5 + }, + { + "epoch": 0.08294930875576037, + "grad_norm": 30.783327102661133, + "learning_rate": 3.0000000000000004e-07, + "loss": 2.1993, + "step": 6 + }, + { + "epoch": 0.0967741935483871, + "grad_norm": 30.57423210144043, + "learning_rate": 3.5000000000000004e-07, + "loss": 2.1506, + "step": 7 + }, + { + "epoch": 0.11059907834101383, + "grad_norm": 30.952186584472656, + "learning_rate": 4.0000000000000003e-07, + "loss": 2.1599, + "step": 8 + }, + { + "epoch": 0.12442396313364056, + "grad_norm": 30.37245750427246, + "learning_rate": 
4.5000000000000003e-07, + "loss": 2.1572, + "step": 9 + }, + { + "epoch": 0.1382488479262673, + "grad_norm": 30.930192947387695, + "learning_rate": 5.000000000000001e-07, + "loss": 2.1447, + "step": 10 + }, + { + "epoch": 0.15207373271889402, + "grad_norm": 29.735448837280273, + "learning_rate": 5.5e-07, + "loss": 2.0742, + "step": 11 + }, + { + "epoch": 0.16589861751152074, + "grad_norm": 29.62826156616211, + "learning_rate": 6.000000000000001e-07, + "loss": 2.061, + "step": 12 + }, + { + "epoch": 0.17972350230414746, + "grad_norm": 28.937463760375977, + "learning_rate": 6.5e-07, + "loss": 1.9974, + "step": 13 + }, + { + "epoch": 0.1935483870967742, + "grad_norm": 29.24833106994629, + "learning_rate": 7.000000000000001e-07, + "loss": 1.9833, + "step": 14 + }, + { + "epoch": 0.2073732718894009, + "grad_norm": 28.122018814086914, + "learning_rate": 7.5e-07, + "loss": 1.8934, + "step": 15 + }, + { + "epoch": 0.22119815668202766, + "grad_norm": 28.059659957885742, + "learning_rate": 8.000000000000001e-07, + "loss": 1.875, + "step": 16 + }, + { + "epoch": 0.2350230414746544, + "grad_norm": 27.361961364746094, + "learning_rate": 8.500000000000001e-07, + "loss": 1.8009, + "step": 17 + }, + { + "epoch": 0.2488479262672811, + "grad_norm": 26.721765518188477, + "learning_rate": 9.000000000000001e-07, + "loss": 1.7116, + "step": 18 + }, + { + "epoch": 0.2626728110599078, + "grad_norm": 25.37330436706543, + "learning_rate": 9.500000000000001e-07, + "loss": 1.5608, + "step": 19 + }, + { + "epoch": 0.2764976958525346, + "grad_norm": 25.81206703186035, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.5043, + "step": 20 + }, + { + "epoch": 0.2903225806451613, + "grad_norm": 25.539344787597656, + "learning_rate": 1.0500000000000001e-06, + "loss": 1.3673, + "step": 21 + }, + { + "epoch": 0.30414746543778803, + "grad_norm": 25.097164154052734, + "learning_rate": 1.1e-06, + "loss": 1.2029, + "step": 22 + }, + { + "epoch": 0.31797235023041476, + "grad_norm": 24.619497299194336, 
+ "learning_rate": 1.1500000000000002e-06, + "loss": 1.0458, + "step": 23 + }, + { + "epoch": 0.3317972350230415, + "grad_norm": 23.820302963256836, + "learning_rate": 1.2000000000000002e-06, + "loss": 0.8723, + "step": 24 + }, + { + "epoch": 0.3456221198156682, + "grad_norm": 23.12735939025879, + "learning_rate": 1.25e-06, + "loss": 0.7183, + "step": 25 + }, + { + "epoch": 0.35944700460829493, + "grad_norm": 20.127134323120117, + "learning_rate": 1.3e-06, + "loss": 0.5248, + "step": 26 + }, + { + "epoch": 0.37327188940092165, + "grad_norm": 15.901495933532715, + "learning_rate": 1.3500000000000002e-06, + "loss": 0.3689, + "step": 27 + }, + { + "epoch": 0.3870967741935484, + "grad_norm": 11.053832054138184, + "learning_rate": 1.4000000000000001e-06, + "loss": 0.2482, + "step": 28 + }, + { + "epoch": 0.4009216589861751, + "grad_norm": 7.248495578765869, + "learning_rate": 1.45e-06, + "loss": 0.1847, + "step": 29 + }, + { + "epoch": 0.4147465437788018, + "grad_norm": 5.378540515899658, + "learning_rate": 1.5e-06, + "loss": 0.1423, + "step": 30 + }, + { + "epoch": 0.42857142857142855, + "grad_norm": 3.8371808528900146, + "learning_rate": 1.5500000000000002e-06, + "loss": 0.1152, + "step": 31 + }, + { + "epoch": 0.4423963133640553, + "grad_norm": 2.2655274868011475, + "learning_rate": 1.6000000000000001e-06, + "loss": 0.0845, + "step": 32 + }, + { + "epoch": 0.45622119815668205, + "grad_norm": 1.5746861696243286, + "learning_rate": 1.6500000000000003e-06, + "loss": 0.0711, + "step": 33 + }, + { + "epoch": 0.4700460829493088, + "grad_norm": 1.3510947227478027, + "learning_rate": 1.7000000000000002e-06, + "loss": 0.0734, + "step": 34 + }, + { + "epoch": 0.4838709677419355, + "grad_norm": 0.9737389087677002, + "learning_rate": 1.75e-06, + "loss": 0.0651, + "step": 35 + }, + { + "epoch": 0.4976958525345622, + "grad_norm": 0.9815284609794617, + "learning_rate": 1.8000000000000001e-06, + "loss": 0.0593, + "step": 36 + }, + { + "epoch": 0.511520737327189, + "grad_norm": 
0.8567912578582764, + "learning_rate": 1.85e-06, + "loss": 0.0543, + "step": 37 + }, + { + "epoch": 0.5253456221198156, + "grad_norm": 0.6773302555084229, + "learning_rate": 1.9000000000000002e-06, + "loss": 0.0622, + "step": 38 + }, + { + "epoch": 0.5391705069124424, + "grad_norm": 0.49936285614967346, + "learning_rate": 1.9500000000000004e-06, + "loss": 0.0511, + "step": 39 + }, + { + "epoch": 0.5529953917050692, + "grad_norm": 0.6253588795661926, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.0478, + "step": 40 + }, + { + "epoch": 0.5668202764976958, + "grad_norm": 0.5103089809417725, + "learning_rate": 2.05e-06, + "loss": 0.0465, + "step": 41 + }, + { + "epoch": 0.5806451612903226, + "grad_norm": 0.29294702410697937, + "learning_rate": 2.1000000000000002e-06, + "loss": 0.0456, + "step": 42 + }, + { + "epoch": 0.5944700460829493, + "grad_norm": 0.4237954616546631, + "learning_rate": 2.15e-06, + "loss": 0.0501, + "step": 43 + }, + { + "epoch": 0.6082949308755761, + "grad_norm": 0.42243412137031555, + "learning_rate": 2.2e-06, + "loss": 0.0388, + "step": 44 + }, + { + "epoch": 0.6221198156682027, + "grad_norm": 0.37881818413734436, + "learning_rate": 2.25e-06, + "loss": 0.0415, + "step": 45 + }, + { + "epoch": 0.6359447004608295, + "grad_norm": 0.4941152036190033, + "learning_rate": 2.3000000000000004e-06, + "loss": 0.045, + "step": 46 + }, + { + "epoch": 0.6497695852534562, + "grad_norm": 0.3046450912952423, + "learning_rate": 2.35e-06, + "loss": 0.0386, + "step": 47 + }, + { + "epoch": 0.663594470046083, + "grad_norm": 0.39361852407455444, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.0447, + "step": 48 + }, + { + "epoch": 0.6774193548387096, + "grad_norm": 0.5190001130104065, + "learning_rate": 2.4500000000000003e-06, + "loss": 0.0364, + "step": 49 + }, + { + "epoch": 0.6912442396313364, + "grad_norm": 0.372072696685791, + "learning_rate": 2.5e-06, + "loss": 0.043, + "step": 50 + }, + { + "epoch": 0.7050691244239631, + "grad_norm": 
0.3756551146507263, + "learning_rate": 2.55e-06, + "loss": 0.0424, + "step": 51 + }, + { + "epoch": 0.7188940092165899, + "grad_norm": 0.4593554437160492, + "learning_rate": 2.6e-06, + "loss": 0.0387, + "step": 52 + }, + { + "epoch": 0.7327188940092166, + "grad_norm": 0.2931855618953705, + "learning_rate": 2.6500000000000005e-06, + "loss": 0.0396, + "step": 53 + }, + { + "epoch": 0.7465437788018433, + "grad_norm": 0.38429534435272217, + "learning_rate": 2.7000000000000004e-06, + "loss": 0.0373, + "step": 54 + }, + { + "epoch": 0.7603686635944701, + "grad_norm": 0.3506857752799988, + "learning_rate": 2.7500000000000004e-06, + "loss": 0.04, + "step": 55 + }, + { + "epoch": 0.7741935483870968, + "grad_norm": 0.29847028851509094, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.0369, + "step": 56 + }, + { + "epoch": 0.7880184331797235, + "grad_norm": 0.3653375506401062, + "learning_rate": 2.85e-06, + "loss": 0.0396, + "step": 57 + }, + { + "epoch": 0.8018433179723502, + "grad_norm": 0.3163083791732788, + "learning_rate": 2.9e-06, + "loss": 0.0337, + "step": 58 + }, + { + "epoch": 0.815668202764977, + "grad_norm": 0.3734363615512848, + "learning_rate": 2.95e-06, + "loss": 0.0327, + "step": 59 + }, + { + "epoch": 0.8294930875576036, + "grad_norm": 0.29547712206840515, + "learning_rate": 3e-06, + "loss": 0.0365, + "step": 60 + }, + { + "epoch": 0.8433179723502304, + "grad_norm": 0.4041007161140442, + "learning_rate": 3.05e-06, + "loss": 0.038, + "step": 61 + }, + { + "epoch": 0.8571428571428571, + "grad_norm": 0.3602149784564972, + "learning_rate": 3.1000000000000004e-06, + "loss": 0.033, + "step": 62 + }, + { + "epoch": 0.8709677419354839, + "grad_norm": 0.2948857545852661, + "learning_rate": 3.1500000000000003e-06, + "loss": 0.0386, + "step": 63 + }, + { + "epoch": 0.8847926267281107, + "grad_norm": 0.39098358154296875, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.0323, + "step": 64 + }, + { + "epoch": 0.8986175115207373, + "grad_norm": 
0.3692062795162201, + "learning_rate": 3.2500000000000002e-06, + "loss": 0.0309, + "step": 65 + }, + { + "epoch": 0.9124423963133641, + "grad_norm": 0.3967229425907135, + "learning_rate": 3.3000000000000006e-06, + "loss": 0.0346, + "step": 66 + }, + { + "epoch": 0.9262672811059908, + "grad_norm": 0.47776708006858826, + "learning_rate": 3.3500000000000005e-06, + "loss": 0.0355, + "step": 67 + }, + { + "epoch": 0.9400921658986175, + "grad_norm": 0.21545131504535675, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.0294, + "step": 68 + }, + { + "epoch": 0.9539170506912442, + "grad_norm": 0.23738539218902588, + "learning_rate": 3.45e-06, + "loss": 0.0308, + "step": 69 + }, + { + "epoch": 0.967741935483871, + "grad_norm": 0.29174014925956726, + "learning_rate": 3.5e-06, + "loss": 0.0312, + "step": 70 + }, + { + "epoch": 0.9815668202764977, + "grad_norm": 0.38475602865219116, + "learning_rate": 3.5500000000000003e-06, + "loss": 0.0324, + "step": 71 + }, + { + "epoch": 0.9953917050691244, + "grad_norm": 0.4077378809452057, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.0297, + "step": 72 + }, + { + "epoch": 1.0, + "grad_norm": 0.4077378809452057, + "learning_rate": 3.65e-06, + "loss": 0.031, + "step": 73 + }, + { + "epoch": 1.0138248847926268, + "grad_norm": 0.46581539511680603, + "learning_rate": 3.7e-06, + "loss": 0.0313, + "step": 74 + }, + { + "epoch": 1.0276497695852536, + "grad_norm": 0.24417200684547424, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.027, + "step": 75 + }, + { + "epoch": 1.0414746543778801, + "grad_norm": 0.20425117015838623, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.0307, + "step": 76 + }, + { + "epoch": 1.055299539170507, + "grad_norm": 0.3578161597251892, + "learning_rate": 3.85e-06, + "loss": 0.0312, + "step": 77 + }, + { + "epoch": 1.0691244239631337, + "grad_norm": 0.39486679434776306, + "learning_rate": 3.900000000000001e-06, + "loss": 0.0294, + "step": 78 + }, + { + "epoch": 1.0829493087557605, + 
"grad_norm": 0.3932795226573944, + "learning_rate": 3.95e-06, + "loss": 0.0307, + "step": 79 + }, + { + "epoch": 1.096774193548387, + "grad_norm": 0.2946235239505768, + "learning_rate": 4.000000000000001e-06, + "loss": 0.0257, + "step": 80 + }, + { + "epoch": 1.1105990783410138, + "grad_norm": 0.3318672776222229, + "learning_rate": 4.05e-06, + "loss": 0.0296, + "step": 81 + }, + { + "epoch": 1.1244239631336406, + "grad_norm": 0.23701588809490204, + "learning_rate": 4.1e-06, + "loss": 0.0298, + "step": 82 + }, + { + "epoch": 1.1382488479262673, + "grad_norm": 0.2415941059589386, + "learning_rate": 4.15e-06, + "loss": 0.0256, + "step": 83 + }, + { + "epoch": 1.1520737327188941, + "grad_norm": 0.24098087847232819, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.0263, + "step": 84 + }, + { + "epoch": 1.1658986175115207, + "grad_norm": 0.3530103862285614, + "learning_rate": 4.25e-06, + "loss": 0.0308, + "step": 85 + }, + { + "epoch": 1.1797235023041475, + "grad_norm": 0.2382838875055313, + "learning_rate": 4.3e-06, + "loss": 0.0254, + "step": 86 + }, + { + "epoch": 1.1935483870967742, + "grad_norm": 0.2670588791370392, + "learning_rate": 4.350000000000001e-06, + "loss": 0.0255, + "step": 87 + }, + { + "epoch": 1.2073732718894008, + "grad_norm": 0.30723804235458374, + "learning_rate": 4.4e-06, + "loss": 0.0263, + "step": 88 + }, + { + "epoch": 1.2211981566820276, + "grad_norm": 0.505890965461731, + "learning_rate": 4.450000000000001e-06, + "loss": 0.0265, + "step": 89 + }, + { + "epoch": 1.2350230414746544, + "grad_norm": 0.24307991564273834, + "learning_rate": 4.5e-06, + "loss": 0.0227, + "step": 90 + }, + { + "epoch": 1.2488479262672811, + "grad_norm": 0.2198561429977417, + "learning_rate": 4.5500000000000005e-06, + "loss": 0.0261, + "step": 91 + }, + { + "epoch": 1.262672811059908, + "grad_norm": 0.2435183823108673, + "learning_rate": 4.600000000000001e-06, + "loss": 0.0225, + "step": 92 + }, + { + "epoch": 1.2764976958525347, + "grad_norm": 
0.18837811052799225, + "learning_rate": 4.65e-06, + "loss": 0.0218, + "step": 93 + }, + { + "epoch": 1.2903225806451613, + "grad_norm": 0.3818771541118622, + "learning_rate": 4.7e-06, + "loss": 0.0223, + "step": 94 + }, + { + "epoch": 1.304147465437788, + "grad_norm": 0.2358720600605011, + "learning_rate": 4.75e-06, + "loss": 0.0204, + "step": 95 + }, + { + "epoch": 1.3179723502304148, + "grad_norm": 0.25374144315719604, + "learning_rate": 4.800000000000001e-06, + "loss": 0.022, + "step": 96 + }, + { + "epoch": 1.3317972350230414, + "grad_norm": 0.36181601881980896, + "learning_rate": 4.85e-06, + "loss": 0.0244, + "step": 97 + }, + { + "epoch": 1.3456221198156681, + "grad_norm": 0.3156590759754181, + "learning_rate": 4.9000000000000005e-06, + "loss": 0.0233, + "step": 98 + }, + { + "epoch": 1.359447004608295, + "grad_norm": 0.21958638727664948, + "learning_rate": 4.95e-06, + "loss": 0.0218, + "step": 99 + }, + { + "epoch": 1.3732718894009217, + "grad_norm": 0.34455621242523193, + "learning_rate": 5e-06, + "loss": 0.0267, + "step": 100 + }, + { + "epoch": 1.3870967741935485, + "grad_norm": 0.283086359500885, + "learning_rate": 4.999888074163108e-06, + "loss": 0.0238, + "step": 101 + }, + { + "epoch": 1.400921658986175, + "grad_norm": 0.28856486082077026, + "learning_rate": 4.999552306674345e-06, + "loss": 0.0186, + "step": 102 + }, + { + "epoch": 1.4147465437788018, + "grad_norm": 0.26721692085266113, + "learning_rate": 4.998992727598557e-06, + "loss": 0.0193, + "step": 103 + }, + { + "epoch": 1.4285714285714286, + "grad_norm": 0.3459971249103546, + "learning_rate": 4.998209387040829e-06, + "loss": 0.0218, + "step": 104 + }, + { + "epoch": 1.4423963133640554, + "grad_norm": 0.25979122519493103, + "learning_rate": 4.9972023551419995e-06, + "loss": 0.0216, + "step": 105 + }, + { + "epoch": 1.456221198156682, + "grad_norm": 0.19960424304008484, + "learning_rate": 4.995971722072379e-06, + "loss": 0.0176, + "step": 106 + }, + { + "epoch": 1.4700460829493087, + 
"grad_norm": 0.2529441714286804, + "learning_rate": 4.9945175980236745e-06, + "loss": 0.0181, + "step": 107 + }, + { + "epoch": 1.4838709677419355, + "grad_norm": 0.2690267264842987, + "learning_rate": 4.992840113199131e-06, + "loss": 0.0196, + "step": 108 + }, + { + "epoch": 1.4976958525345623, + "grad_norm": 0.3516470789909363, + "learning_rate": 4.990939417801859e-06, + "loss": 0.0182, + "step": 109 + }, + { + "epoch": 1.511520737327189, + "grad_norm": 0.30167508125305176, + "learning_rate": 4.988815682021398e-06, + "loss": 0.0205, + "step": 110 + }, + { + "epoch": 1.5253456221198156, + "grad_norm": 0.3920849859714508, + "learning_rate": 4.986469096018472e-06, + "loss": 0.0177, + "step": 111 + }, + { + "epoch": 1.5391705069124424, + "grad_norm": 0.3274078369140625, + "learning_rate": 4.983899869907963e-06, + "loss": 0.0185, + "step": 112 + }, + { + "epoch": 1.5529953917050692, + "grad_norm": 0.2237282395362854, + "learning_rate": 4.981108233740096e-06, + "loss": 0.016, + "step": 113 + }, + { + "epoch": 1.5668202764976957, + "grad_norm": 0.23966379463672638, + "learning_rate": 4.978094437479843e-06, + "loss": 0.0183, + "step": 114 + }, + { + "epoch": 1.5806451612903225, + "grad_norm": 0.4027673602104187, + "learning_rate": 4.97485875098454e-06, + "loss": 0.0171, + "step": 115 + }, + { + "epoch": 1.5944700460829493, + "grad_norm": 0.24082835018634796, + "learning_rate": 4.971401463979722e-06, + "loss": 0.016, + "step": 116 + }, + { + "epoch": 1.608294930875576, + "grad_norm": 0.19387558102607727, + "learning_rate": 4.967722886033181e-06, + "loss": 0.0165, + "step": 117 + }, + { + "epoch": 1.6221198156682028, + "grad_norm": 0.33696162700653076, + "learning_rate": 4.963823346527249e-06, + "loss": 0.0154, + "step": 118 + }, + { + "epoch": 1.6359447004608296, + "grad_norm": 0.30290740728378296, + "learning_rate": 4.959703194629304e-06, + "loss": 0.0175, + "step": 119 + }, + { + "epoch": 1.6497695852534562, + "grad_norm": 0.3781787157058716, + "learning_rate": 
4.955362799260507e-06, + "loss": 0.0145, + "step": 120 + }, + { + "epoch": 1.663594470046083, + "grad_norm": 0.39995357394218445, + "learning_rate": 4.950802549062764e-06, + "loss": 0.015, + "step": 121 + }, + { + "epoch": 1.6774193548387095, + "grad_norm": 0.19926570355892181, + "learning_rate": 4.946022852363932e-06, + "loss": 0.0135, + "step": 122 + }, + { + "epoch": 1.6912442396313363, + "grad_norm": 0.22450515627861023, + "learning_rate": 4.9410241371412525e-06, + "loss": 0.0135, + "step": 123 + }, + { + "epoch": 1.705069124423963, + "grad_norm": 0.3588384985923767, + "learning_rate": 4.935806850983034e-06, + "loss": 0.0125, + "step": 124 + }, + { + "epoch": 1.7188940092165899, + "grad_norm": 0.28571122884750366, + "learning_rate": 4.9303714610485705e-06, + "loss": 0.0166, + "step": 125 + }, + { + "epoch": 1.7327188940092166, + "grad_norm": 0.3496967852115631, + "learning_rate": 4.924718454026318e-06, + "loss": 0.0139, + "step": 126 + }, + { + "epoch": 1.7465437788018434, + "grad_norm": 0.3279854357242584, + "learning_rate": 4.918848336090309e-06, + "loss": 0.0133, + "step": 127 + }, + { + "epoch": 1.7603686635944702, + "grad_norm": 0.19201801717281342, + "learning_rate": 4.912761632854834e-06, + "loss": 0.0151, + "step": 128 + }, + { + "epoch": 1.7741935483870968, + "grad_norm": 0.27701929211616516, + "learning_rate": 4.906458889327375e-06, + "loss": 0.0148, + "step": 129 + }, + { + "epoch": 1.7880184331797235, + "grad_norm": 0.2757968008518219, + "learning_rate": 4.899940669859807e-06, + "loss": 0.0118, + "step": 130 + }, + { + "epoch": 1.80184331797235, + "grad_norm": 0.18373191356658936, + "learning_rate": 4.893207558097867e-06, + "loss": 0.0149, + "step": 131 + }, + { + "epoch": 1.8156682027649769, + "grad_norm": 0.2116280496120453, + "learning_rate": 4.8862601569288885e-06, + "loss": 0.0129, + "step": 132 + }, + { + "epoch": 1.8294930875576036, + "grad_norm": 0.30384117364883423, + "learning_rate": 4.879099088427824e-06, + "loss": 0.0136, + "step": 133 + 
}, + { + "epoch": 1.8433179723502304, + "grad_norm": 0.3766787052154541, + "learning_rate": 4.871724993801541e-06, + "loss": 0.0123, + "step": 134 + }, + { + "epoch": 1.8571428571428572, + "grad_norm": 0.3401263356208801, + "learning_rate": 4.864138533331411e-06, + "loss": 0.0122, + "step": 135 + }, + { + "epoch": 1.870967741935484, + "grad_norm": 0.24321958422660828, + "learning_rate": 4.8563403863141825e-06, + "loss": 0.0123, + "step": 136 + }, + { + "epoch": 1.8847926267281108, + "grad_norm": 0.16918110847473145, + "learning_rate": 4.84833125100116e-06, + "loss": 0.0104, + "step": 137 + }, + { + "epoch": 1.8986175115207373, + "grad_norm": 0.23489230871200562, + "learning_rate": 4.840111844535682e-06, + "loss": 0.0122, + "step": 138 + }, + { + "epoch": 1.912442396313364, + "grad_norm": 0.32796236872673035, + "learning_rate": 4.8316829028889076e-06, + "loss": 0.0109, + "step": 139 + }, + { + "epoch": 1.9262672811059907, + "grad_norm": 0.24210475385189056, + "learning_rate": 4.823045180793914e-06, + "loss": 0.0118, + "step": 140 + }, + { + "epoch": 1.9400921658986174, + "grad_norm": 0.3450548052787781, + "learning_rate": 4.8141994516781196e-06, + "loss": 0.0115, + "step": 141 + }, + { + "epoch": 1.9539170506912442, + "grad_norm": 0.23163923621177673, + "learning_rate": 4.805146507594034e-06, + "loss": 0.0122, + "step": 142 + }, + { + "epoch": 1.967741935483871, + "grad_norm": 0.8197745084762573, + "learning_rate": 4.7958871591483305e-06, + "loss": 0.0101, + "step": 143 + }, + { + "epoch": 1.9815668202764978, + "grad_norm": 0.2917576730251312, + "learning_rate": 4.786422235429269e-06, + "loss": 0.0078, + "step": 144 + } + ], + "logging_steps": 1, + "max_steps": 432, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 72, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + 
"total_flos": 9.220896013978436e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-144/training_args.bin b/checkpoint-144/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..66c881ded23f97eecfbd08abb955a7188907de16 --- /dev/null +++ b/checkpoint-144/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bccf4859d2ee74fd992386ba4a70a4c4fc6d0da061af69465c1db71ce0f24882 +size 7928 diff --git a/checkpoint-144/zero_to_fp32.py b/checkpoint-144/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/checkpoint-144/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in 
state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to 
the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel 
{unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + 
print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + 
partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the 
partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + 
state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. 
you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. 
Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/checkpoint-216/README.md b/checkpoint-216/README.md new file mode 100644 index 0000000000000000000000000000000000000000..037e1a543b9c1891b5c6981f89d5b7c7c9a907ae --- /dev/null +++ b/checkpoint-216/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Llama-3.1-70B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.0 \ No newline at end of file diff --git a/checkpoint-216/adapter_config.json b/checkpoint-216/adapter_config.json new file mode 100644 index 
0000000000000000000000000000000000000000..3295690c20dff5dc1d1f30f8500f0efb7e255838 --- /dev/null +++ b/checkpoint-216/adapter_config.json @@ -0,0 +1,42 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.1-70B-Instruct", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 512, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [ + "embed_tokens", + "lm_head" + ], + "peft_type": "LORA", + "r": 256, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "down_proj", + "gate_proj", + "v_proj", + "up_proj", + "k_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-216/adapter_model.safetensors b/checkpoint-216/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f455d2e256a2e0ba29ee80fb487824999b727661 --- /dev/null +++ b/checkpoint-216/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c471c6c3855df885543a9f76d310e831acd1e4b421c39f9d4f5a33245d75fe9 +size 10829849744 diff --git a/checkpoint-216/global_step214/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-216/global_step214/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..649455532a3b881b2778b25f7e15b3abc80a4600 --- /dev/null +++ b/checkpoint-216/global_step214/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:f134ee7e4572fa6a89c4c5aa59106a4e6f0802f74fee127713bd3230b6e16f51 +size 21659418140 diff --git a/checkpoint-216/global_step214/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-216/global_step214/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..423f1d21e7de2083bdea5ddc1e50ea10920cae62 --- /dev/null +++ b/checkpoint-216/global_step214/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9cba68dba63422fd9873f72ce05a19a315b81e4e4008d6d5cc1bbe3f6520d737 +size 21659457372 diff --git a/checkpoint-216/global_step214/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-216/global_step214/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0928ed80c480162df8ce76e815dbd4f8398fb70a --- /dev/null +++ b/checkpoint-216/global_step214/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c45757c3581399ca83ed0eb0df464a5206b3d02812c0f3a7e70f6f289b1d9eae +size 21659417820 diff --git a/checkpoint-216/global_step214/mp_rank_00_model_states.pt b/checkpoint-216/global_step214/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fdc55e4866977b1d9702316517fd0f375ead0de5 --- /dev/null +++ b/checkpoint-216/global_step214/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18351169e8490a497b785cac0ea1ab335e5b8bbb24478d021b7b997876568e0a +size 11918643933 diff --git a/checkpoint-216/latest b/checkpoint-216/latest new file mode 100644 index 0000000000000000000000000000000000000000..4e37c1a038b3403862a938449a34498d62500618 --- /dev/null +++ b/checkpoint-216/latest @@ -0,0 +1 @@ +global_step214 \ No newline at end of file diff --git a/checkpoint-216/rng_state_0.pth b/checkpoint-216/rng_state_0.pth new file mode 100644 index 
0000000000000000000000000000000000000000..ba5a90758511717b18c86dec56dec274c94d97c4 --- /dev/null +++ b/checkpoint-216/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ed596fc93f566986c2d858ceb6a24c13dfe40c2b6101df11b4cc46fd672586f +size 14768 diff --git a/checkpoint-216/rng_state_1.pth b/checkpoint-216/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..7a3f8036f33cbe77dd6464b3d48d8d636ba8ba27 --- /dev/null +++ b/checkpoint-216/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24cbc883095047d687cc74366529afb0b786a7226b5ea9db155182b7cc6317da +size 14768 diff --git a/checkpoint-216/rng_state_2.pth b/checkpoint-216/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..564f05e4f0b5793bc8ea053a987335304a02c0ed --- /dev/null +++ b/checkpoint-216/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2975a1b5a2dca31e958a33852c2eb51b98f38898301485c941de812ec9019925 +size 14768 diff --git a/checkpoint-216/scheduler.pt b/checkpoint-216/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..5aa48f9e2224074123d70cbfd49c476a8ca21ea3 --- /dev/null +++ b/checkpoint-216/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a177484072060bf319ca9a44b8c986d20ca392d8b2158584c14221fe24d8381 +size 1064 diff --git a/checkpoint-216/special_tokens_map.json b/checkpoint-216/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..278b7f0f84be865c4687700ee7b3c63d89a51e18 --- /dev/null +++ b/checkpoint-216/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + 
"pad_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-216/tokenizer.json b/checkpoint-216/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-216/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-216/tokenizer_config.json b/checkpoint-216/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ca91a2ef55f4239a7af81d7c9abb05f53621a07b --- /dev/null +++ b/checkpoint-216/tokenizer_config.json @@ -0,0 +1,2064 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": 
"<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": 
"<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": 
"<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": 
"<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": 
"<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + 
"extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|end_of_text|>", + "tokenizer_class": "PreTrainedTokenizer" +} diff --git a/checkpoint-216/trainer_state.json b/checkpoint-216/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..52fc254fedd874486af6aeced96582e7af69391d --- /dev/null +++ b/checkpoint-216/trainer_state.json @@ -0,0 +1,1545 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.967741935483871, + "eval_steps": 500, + "global_step": 216, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.013824884792626729, + "grad_norm": 31.00213623046875, + "learning_rate": 5.0000000000000004e-08, + "loss": 2.2089, + "step": 1 + }, + { + "epoch": 0.027649769585253458, + "grad_norm": 30.27136993408203, + "learning_rate": 1.0000000000000001e-07, + "loss": 2.1536, + "step": 2 + }, + { + "epoch": 0.041474654377880185, + "grad_norm": 30.48703384399414, + "learning_rate": 1.5000000000000002e-07, + "loss": 2.1581, + "step": 3 + }, + { + "epoch": 0.055299539170506916, + "grad_norm": 30.779329299926758, + "learning_rate": 2.0000000000000002e-07, + "loss": 2.1741, + "step": 4 + }, + { + "epoch": 0.06912442396313365, + "grad_norm": 31.22808837890625, + "learning_rate": 2.5000000000000004e-07, + "loss": 2.1864, + "step": 5 + }, + { + "epoch": 0.08294930875576037, + "grad_norm": 30.783327102661133, + "learning_rate": 3.0000000000000004e-07, + "loss": 2.1993, + "step": 6 + }, + { + "epoch": 0.0967741935483871, + "grad_norm": 30.57423210144043, + "learning_rate": 3.5000000000000004e-07, + "loss": 2.1506, + "step": 7 + }, + { + "epoch": 0.11059907834101383, + "grad_norm": 30.952186584472656, + "learning_rate": 4.0000000000000003e-07, + "loss": 2.1599, + "step": 8 + }, + { + "epoch": 0.12442396313364056, + "grad_norm": 30.37245750427246, + "learning_rate": 
4.5000000000000003e-07, + "loss": 2.1572, + "step": 9 + }, + { + "epoch": 0.1382488479262673, + "grad_norm": 30.930192947387695, + "learning_rate": 5.000000000000001e-07, + "loss": 2.1447, + "step": 10 + }, + { + "epoch": 0.15207373271889402, + "grad_norm": 29.735448837280273, + "learning_rate": 5.5e-07, + "loss": 2.0742, + "step": 11 + }, + { + "epoch": 0.16589861751152074, + "grad_norm": 29.62826156616211, + "learning_rate": 6.000000000000001e-07, + "loss": 2.061, + "step": 12 + }, + { + "epoch": 0.17972350230414746, + "grad_norm": 28.937463760375977, + "learning_rate": 6.5e-07, + "loss": 1.9974, + "step": 13 + }, + { + "epoch": 0.1935483870967742, + "grad_norm": 29.24833106994629, + "learning_rate": 7.000000000000001e-07, + "loss": 1.9833, + "step": 14 + }, + { + "epoch": 0.2073732718894009, + "grad_norm": 28.122018814086914, + "learning_rate": 7.5e-07, + "loss": 1.8934, + "step": 15 + }, + { + "epoch": 0.22119815668202766, + "grad_norm": 28.059659957885742, + "learning_rate": 8.000000000000001e-07, + "loss": 1.875, + "step": 16 + }, + { + "epoch": 0.2350230414746544, + "grad_norm": 27.361961364746094, + "learning_rate": 8.500000000000001e-07, + "loss": 1.8009, + "step": 17 + }, + { + "epoch": 0.2488479262672811, + "grad_norm": 26.721765518188477, + "learning_rate": 9.000000000000001e-07, + "loss": 1.7116, + "step": 18 + }, + { + "epoch": 0.2626728110599078, + "grad_norm": 25.37330436706543, + "learning_rate": 9.500000000000001e-07, + "loss": 1.5608, + "step": 19 + }, + { + "epoch": 0.2764976958525346, + "grad_norm": 25.81206703186035, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.5043, + "step": 20 + }, + { + "epoch": 0.2903225806451613, + "grad_norm": 25.539344787597656, + "learning_rate": 1.0500000000000001e-06, + "loss": 1.3673, + "step": 21 + }, + { + "epoch": 0.30414746543778803, + "grad_norm": 25.097164154052734, + "learning_rate": 1.1e-06, + "loss": 1.2029, + "step": 22 + }, + { + "epoch": 0.31797235023041476, + "grad_norm": 24.619497299194336, 
+ "learning_rate": 1.1500000000000002e-06, + "loss": 1.0458, + "step": 23 + }, + { + "epoch": 0.3317972350230415, + "grad_norm": 23.820302963256836, + "learning_rate": 1.2000000000000002e-06, + "loss": 0.8723, + "step": 24 + }, + { + "epoch": 0.3456221198156682, + "grad_norm": 23.12735939025879, + "learning_rate": 1.25e-06, + "loss": 0.7183, + "step": 25 + }, + { + "epoch": 0.35944700460829493, + "grad_norm": 20.127134323120117, + "learning_rate": 1.3e-06, + "loss": 0.5248, + "step": 26 + }, + { + "epoch": 0.37327188940092165, + "grad_norm": 15.901495933532715, + "learning_rate": 1.3500000000000002e-06, + "loss": 0.3689, + "step": 27 + }, + { + "epoch": 0.3870967741935484, + "grad_norm": 11.053832054138184, + "learning_rate": 1.4000000000000001e-06, + "loss": 0.2482, + "step": 28 + }, + { + "epoch": 0.4009216589861751, + "grad_norm": 7.248495578765869, + "learning_rate": 1.45e-06, + "loss": 0.1847, + "step": 29 + }, + { + "epoch": 0.4147465437788018, + "grad_norm": 5.378540515899658, + "learning_rate": 1.5e-06, + "loss": 0.1423, + "step": 30 + }, + { + "epoch": 0.42857142857142855, + "grad_norm": 3.8371808528900146, + "learning_rate": 1.5500000000000002e-06, + "loss": 0.1152, + "step": 31 + }, + { + "epoch": 0.4423963133640553, + "grad_norm": 2.2655274868011475, + "learning_rate": 1.6000000000000001e-06, + "loss": 0.0845, + "step": 32 + }, + { + "epoch": 0.45622119815668205, + "grad_norm": 1.5746861696243286, + "learning_rate": 1.6500000000000003e-06, + "loss": 0.0711, + "step": 33 + }, + { + "epoch": 0.4700460829493088, + "grad_norm": 1.3510947227478027, + "learning_rate": 1.7000000000000002e-06, + "loss": 0.0734, + "step": 34 + }, + { + "epoch": 0.4838709677419355, + "grad_norm": 0.9737389087677002, + "learning_rate": 1.75e-06, + "loss": 0.0651, + "step": 35 + }, + { + "epoch": 0.4976958525345622, + "grad_norm": 0.9815284609794617, + "learning_rate": 1.8000000000000001e-06, + "loss": 0.0593, + "step": 36 + }, + { + "epoch": 0.511520737327189, + "grad_norm": 
0.8567912578582764, + "learning_rate": 1.85e-06, + "loss": 0.0543, + "step": 37 + }, + { + "epoch": 0.5253456221198156, + "grad_norm": 0.6773302555084229, + "learning_rate": 1.9000000000000002e-06, + "loss": 0.0622, + "step": 38 + }, + { + "epoch": 0.5391705069124424, + "grad_norm": 0.49936285614967346, + "learning_rate": 1.9500000000000004e-06, + "loss": 0.0511, + "step": 39 + }, + { + "epoch": 0.5529953917050692, + "grad_norm": 0.6253588795661926, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.0478, + "step": 40 + }, + { + "epoch": 0.5668202764976958, + "grad_norm": 0.5103089809417725, + "learning_rate": 2.05e-06, + "loss": 0.0465, + "step": 41 + }, + { + "epoch": 0.5806451612903226, + "grad_norm": 0.29294702410697937, + "learning_rate": 2.1000000000000002e-06, + "loss": 0.0456, + "step": 42 + }, + { + "epoch": 0.5944700460829493, + "grad_norm": 0.4237954616546631, + "learning_rate": 2.15e-06, + "loss": 0.0501, + "step": 43 + }, + { + "epoch": 0.6082949308755761, + "grad_norm": 0.42243412137031555, + "learning_rate": 2.2e-06, + "loss": 0.0388, + "step": 44 + }, + { + "epoch": 0.6221198156682027, + "grad_norm": 0.37881818413734436, + "learning_rate": 2.25e-06, + "loss": 0.0415, + "step": 45 + }, + { + "epoch": 0.6359447004608295, + "grad_norm": 0.4941152036190033, + "learning_rate": 2.3000000000000004e-06, + "loss": 0.045, + "step": 46 + }, + { + "epoch": 0.6497695852534562, + "grad_norm": 0.3046450912952423, + "learning_rate": 2.35e-06, + "loss": 0.0386, + "step": 47 + }, + { + "epoch": 0.663594470046083, + "grad_norm": 0.39361852407455444, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.0447, + "step": 48 + }, + { + "epoch": 0.6774193548387096, + "grad_norm": 0.5190001130104065, + "learning_rate": 2.4500000000000003e-06, + "loss": 0.0364, + "step": 49 + }, + { + "epoch": 0.6912442396313364, + "grad_norm": 0.372072696685791, + "learning_rate": 2.5e-06, + "loss": 0.043, + "step": 50 + }, + { + "epoch": 0.7050691244239631, + "grad_norm": 
0.3756551146507263, + "learning_rate": 2.55e-06, + "loss": 0.0424, + "step": 51 + }, + { + "epoch": 0.7188940092165899, + "grad_norm": 0.4593554437160492, + "learning_rate": 2.6e-06, + "loss": 0.0387, + "step": 52 + }, + { + "epoch": 0.7327188940092166, + "grad_norm": 0.2931855618953705, + "learning_rate": 2.6500000000000005e-06, + "loss": 0.0396, + "step": 53 + }, + { + "epoch": 0.7465437788018433, + "grad_norm": 0.38429534435272217, + "learning_rate": 2.7000000000000004e-06, + "loss": 0.0373, + "step": 54 + }, + { + "epoch": 0.7603686635944701, + "grad_norm": 0.3506857752799988, + "learning_rate": 2.7500000000000004e-06, + "loss": 0.04, + "step": 55 + }, + { + "epoch": 0.7741935483870968, + "grad_norm": 0.29847028851509094, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.0369, + "step": 56 + }, + { + "epoch": 0.7880184331797235, + "grad_norm": 0.3653375506401062, + "learning_rate": 2.85e-06, + "loss": 0.0396, + "step": 57 + }, + { + "epoch": 0.8018433179723502, + "grad_norm": 0.3163083791732788, + "learning_rate": 2.9e-06, + "loss": 0.0337, + "step": 58 + }, + { + "epoch": 0.815668202764977, + "grad_norm": 0.3734363615512848, + "learning_rate": 2.95e-06, + "loss": 0.0327, + "step": 59 + }, + { + "epoch": 0.8294930875576036, + "grad_norm": 0.29547712206840515, + "learning_rate": 3e-06, + "loss": 0.0365, + "step": 60 + }, + { + "epoch": 0.8433179723502304, + "grad_norm": 0.4041007161140442, + "learning_rate": 3.05e-06, + "loss": 0.038, + "step": 61 + }, + { + "epoch": 0.8571428571428571, + "grad_norm": 0.3602149784564972, + "learning_rate": 3.1000000000000004e-06, + "loss": 0.033, + "step": 62 + }, + { + "epoch": 0.8709677419354839, + "grad_norm": 0.2948857545852661, + "learning_rate": 3.1500000000000003e-06, + "loss": 0.0386, + "step": 63 + }, + { + "epoch": 0.8847926267281107, + "grad_norm": 0.39098358154296875, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.0323, + "step": 64 + }, + { + "epoch": 0.8986175115207373, + "grad_norm": 
0.3692062795162201, + "learning_rate": 3.2500000000000002e-06, + "loss": 0.0309, + "step": 65 + }, + { + "epoch": 0.9124423963133641, + "grad_norm": 0.3967229425907135, + "learning_rate": 3.3000000000000006e-06, + "loss": 0.0346, + "step": 66 + }, + { + "epoch": 0.9262672811059908, + "grad_norm": 0.47776708006858826, + "learning_rate": 3.3500000000000005e-06, + "loss": 0.0355, + "step": 67 + }, + { + "epoch": 0.9400921658986175, + "grad_norm": 0.21545131504535675, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.0294, + "step": 68 + }, + { + "epoch": 0.9539170506912442, + "grad_norm": 0.23738539218902588, + "learning_rate": 3.45e-06, + "loss": 0.0308, + "step": 69 + }, + { + "epoch": 0.967741935483871, + "grad_norm": 0.29174014925956726, + "learning_rate": 3.5e-06, + "loss": 0.0312, + "step": 70 + }, + { + "epoch": 0.9815668202764977, + "grad_norm": 0.38475602865219116, + "learning_rate": 3.5500000000000003e-06, + "loss": 0.0324, + "step": 71 + }, + { + "epoch": 0.9953917050691244, + "grad_norm": 0.4077378809452057, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.0297, + "step": 72 + }, + { + "epoch": 1.0, + "grad_norm": 0.4077378809452057, + "learning_rate": 3.65e-06, + "loss": 0.031, + "step": 73 + }, + { + "epoch": 1.0138248847926268, + "grad_norm": 0.46581539511680603, + "learning_rate": 3.7e-06, + "loss": 0.0313, + "step": 74 + }, + { + "epoch": 1.0276497695852536, + "grad_norm": 0.24417200684547424, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.027, + "step": 75 + }, + { + "epoch": 1.0414746543778801, + "grad_norm": 0.20425117015838623, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.0307, + "step": 76 + }, + { + "epoch": 1.055299539170507, + "grad_norm": 0.3578161597251892, + "learning_rate": 3.85e-06, + "loss": 0.0312, + "step": 77 + }, + { + "epoch": 1.0691244239631337, + "grad_norm": 0.39486679434776306, + "learning_rate": 3.900000000000001e-06, + "loss": 0.0294, + "step": 78 + }, + { + "epoch": 1.0829493087557605, + 
"grad_norm": 0.3932795226573944, + "learning_rate": 3.95e-06, + "loss": 0.0307, + "step": 79 + }, + { + "epoch": 1.096774193548387, + "grad_norm": 0.2946235239505768, + "learning_rate": 4.000000000000001e-06, + "loss": 0.0257, + "step": 80 + }, + { + "epoch": 1.1105990783410138, + "grad_norm": 0.3318672776222229, + "learning_rate": 4.05e-06, + "loss": 0.0296, + "step": 81 + }, + { + "epoch": 1.1244239631336406, + "grad_norm": 0.23701588809490204, + "learning_rate": 4.1e-06, + "loss": 0.0298, + "step": 82 + }, + { + "epoch": 1.1382488479262673, + "grad_norm": 0.2415941059589386, + "learning_rate": 4.15e-06, + "loss": 0.0256, + "step": 83 + }, + { + "epoch": 1.1520737327188941, + "grad_norm": 0.24098087847232819, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.0263, + "step": 84 + }, + { + "epoch": 1.1658986175115207, + "grad_norm": 0.3530103862285614, + "learning_rate": 4.25e-06, + "loss": 0.0308, + "step": 85 + }, + { + "epoch": 1.1797235023041475, + "grad_norm": 0.2382838875055313, + "learning_rate": 4.3e-06, + "loss": 0.0254, + "step": 86 + }, + { + "epoch": 1.1935483870967742, + "grad_norm": 0.2670588791370392, + "learning_rate": 4.350000000000001e-06, + "loss": 0.0255, + "step": 87 + }, + { + "epoch": 1.2073732718894008, + "grad_norm": 0.30723804235458374, + "learning_rate": 4.4e-06, + "loss": 0.0263, + "step": 88 + }, + { + "epoch": 1.2211981566820276, + "grad_norm": 0.505890965461731, + "learning_rate": 4.450000000000001e-06, + "loss": 0.0265, + "step": 89 + }, + { + "epoch": 1.2350230414746544, + "grad_norm": 0.24307991564273834, + "learning_rate": 4.5e-06, + "loss": 0.0227, + "step": 90 + }, + { + "epoch": 1.2488479262672811, + "grad_norm": 0.2198561429977417, + "learning_rate": 4.5500000000000005e-06, + "loss": 0.0261, + "step": 91 + }, + { + "epoch": 1.262672811059908, + "grad_norm": 0.2435183823108673, + "learning_rate": 4.600000000000001e-06, + "loss": 0.0225, + "step": 92 + }, + { + "epoch": 1.2764976958525347, + "grad_norm": 
0.18837811052799225, + "learning_rate": 4.65e-06, + "loss": 0.0218, + "step": 93 + }, + { + "epoch": 1.2903225806451613, + "grad_norm": 0.3818771541118622, + "learning_rate": 4.7e-06, + "loss": 0.0223, + "step": 94 + }, + { + "epoch": 1.304147465437788, + "grad_norm": 0.2358720600605011, + "learning_rate": 4.75e-06, + "loss": 0.0204, + "step": 95 + }, + { + "epoch": 1.3179723502304148, + "grad_norm": 0.25374144315719604, + "learning_rate": 4.800000000000001e-06, + "loss": 0.022, + "step": 96 + }, + { + "epoch": 1.3317972350230414, + "grad_norm": 0.36181601881980896, + "learning_rate": 4.85e-06, + "loss": 0.0244, + "step": 97 + }, + { + "epoch": 1.3456221198156681, + "grad_norm": 0.3156590759754181, + "learning_rate": 4.9000000000000005e-06, + "loss": 0.0233, + "step": 98 + }, + { + "epoch": 1.359447004608295, + "grad_norm": 0.21958638727664948, + "learning_rate": 4.95e-06, + "loss": 0.0218, + "step": 99 + }, + { + "epoch": 1.3732718894009217, + "grad_norm": 0.34455621242523193, + "learning_rate": 5e-06, + "loss": 0.0267, + "step": 100 + }, + { + "epoch": 1.3870967741935485, + "grad_norm": 0.283086359500885, + "learning_rate": 4.999888074163108e-06, + "loss": 0.0238, + "step": 101 + }, + { + "epoch": 1.400921658986175, + "grad_norm": 0.28856486082077026, + "learning_rate": 4.999552306674345e-06, + "loss": 0.0186, + "step": 102 + }, + { + "epoch": 1.4147465437788018, + "grad_norm": 0.26721692085266113, + "learning_rate": 4.998992727598557e-06, + "loss": 0.0193, + "step": 103 + }, + { + "epoch": 1.4285714285714286, + "grad_norm": 0.3459971249103546, + "learning_rate": 4.998209387040829e-06, + "loss": 0.0218, + "step": 104 + }, + { + "epoch": 1.4423963133640554, + "grad_norm": 0.25979122519493103, + "learning_rate": 4.9972023551419995e-06, + "loss": 0.0216, + "step": 105 + }, + { + "epoch": 1.456221198156682, + "grad_norm": 0.19960424304008484, + "learning_rate": 4.995971722072379e-06, + "loss": 0.0176, + "step": 106 + }, + { + "epoch": 1.4700460829493087, + 
"grad_norm": 0.2529441714286804, + "learning_rate": 4.9945175980236745e-06, + "loss": 0.0181, + "step": 107 + }, + { + "epoch": 1.4838709677419355, + "grad_norm": 0.2690267264842987, + "learning_rate": 4.992840113199131e-06, + "loss": 0.0196, + "step": 108 + }, + { + "epoch": 1.4976958525345623, + "grad_norm": 0.3516470789909363, + "learning_rate": 4.990939417801859e-06, + "loss": 0.0182, + "step": 109 + }, + { + "epoch": 1.511520737327189, + "grad_norm": 0.30167508125305176, + "learning_rate": 4.988815682021398e-06, + "loss": 0.0205, + "step": 110 + }, + { + "epoch": 1.5253456221198156, + "grad_norm": 0.3920849859714508, + "learning_rate": 4.986469096018472e-06, + "loss": 0.0177, + "step": 111 + }, + { + "epoch": 1.5391705069124424, + "grad_norm": 0.3274078369140625, + "learning_rate": 4.983899869907963e-06, + "loss": 0.0185, + "step": 112 + }, + { + "epoch": 1.5529953917050692, + "grad_norm": 0.2237282395362854, + "learning_rate": 4.981108233740096e-06, + "loss": 0.016, + "step": 113 + }, + { + "epoch": 1.5668202764976957, + "grad_norm": 0.23966379463672638, + "learning_rate": 4.978094437479843e-06, + "loss": 0.0183, + "step": 114 + }, + { + "epoch": 1.5806451612903225, + "grad_norm": 0.4027673602104187, + "learning_rate": 4.97485875098454e-06, + "loss": 0.0171, + "step": 115 + }, + { + "epoch": 1.5944700460829493, + "grad_norm": 0.24082835018634796, + "learning_rate": 4.971401463979722e-06, + "loss": 0.016, + "step": 116 + }, + { + "epoch": 1.608294930875576, + "grad_norm": 0.19387558102607727, + "learning_rate": 4.967722886033181e-06, + "loss": 0.0165, + "step": 117 + }, + { + "epoch": 1.6221198156682028, + "grad_norm": 0.33696162700653076, + "learning_rate": 4.963823346527249e-06, + "loss": 0.0154, + "step": 118 + }, + { + "epoch": 1.6359447004608296, + "grad_norm": 0.30290740728378296, + "learning_rate": 4.959703194629304e-06, + "loss": 0.0175, + "step": 119 + }, + { + "epoch": 1.6497695852534562, + "grad_norm": 0.3781787157058716, + "learning_rate": 
4.955362799260507e-06, + "loss": 0.0145, + "step": 120 + }, + { + "epoch": 1.663594470046083, + "grad_norm": 0.39995357394218445, + "learning_rate": 4.950802549062764e-06, + "loss": 0.015, + "step": 121 + }, + { + "epoch": 1.6774193548387095, + "grad_norm": 0.19926570355892181, + "learning_rate": 4.946022852363932e-06, + "loss": 0.0135, + "step": 122 + }, + { + "epoch": 1.6912442396313363, + "grad_norm": 0.22450515627861023, + "learning_rate": 4.9410241371412525e-06, + "loss": 0.0135, + "step": 123 + }, + { + "epoch": 1.705069124423963, + "grad_norm": 0.3588384985923767, + "learning_rate": 4.935806850983034e-06, + "loss": 0.0125, + "step": 124 + }, + { + "epoch": 1.7188940092165899, + "grad_norm": 0.28571122884750366, + "learning_rate": 4.9303714610485705e-06, + "loss": 0.0166, + "step": 125 + }, + { + "epoch": 1.7327188940092166, + "grad_norm": 0.3496967852115631, + "learning_rate": 4.924718454026318e-06, + "loss": 0.0139, + "step": 126 + }, + { + "epoch": 1.7465437788018434, + "grad_norm": 0.3279854357242584, + "learning_rate": 4.918848336090309e-06, + "loss": 0.0133, + "step": 127 + }, + { + "epoch": 1.7603686635944702, + "grad_norm": 0.19201801717281342, + "learning_rate": 4.912761632854834e-06, + "loss": 0.0151, + "step": 128 + }, + { + "epoch": 1.7741935483870968, + "grad_norm": 0.27701929211616516, + "learning_rate": 4.906458889327375e-06, + "loss": 0.0148, + "step": 129 + }, + { + "epoch": 1.7880184331797235, + "grad_norm": 0.2757968008518219, + "learning_rate": 4.899940669859807e-06, + "loss": 0.0118, + "step": 130 + }, + { + "epoch": 1.80184331797235, + "grad_norm": 0.18373191356658936, + "learning_rate": 4.893207558097867e-06, + "loss": 0.0149, + "step": 131 + }, + { + "epoch": 1.8156682027649769, + "grad_norm": 0.2116280496120453, + "learning_rate": 4.8862601569288885e-06, + "loss": 0.0129, + "step": 132 + }, + { + "epoch": 1.8294930875576036, + "grad_norm": 0.30384117364883423, + "learning_rate": 4.879099088427824e-06, + "loss": 0.0136, + "step": 133 + 
}, + { + "epoch": 1.8433179723502304, + "grad_norm": 0.3766787052154541, + "learning_rate": 4.871724993801541e-06, + "loss": 0.0123, + "step": 134 + }, + { + "epoch": 1.8571428571428572, + "grad_norm": 0.3401263356208801, + "learning_rate": 4.864138533331411e-06, + "loss": 0.0122, + "step": 135 + }, + { + "epoch": 1.870967741935484, + "grad_norm": 0.24321958422660828, + "learning_rate": 4.8563403863141825e-06, + "loss": 0.0123, + "step": 136 + }, + { + "epoch": 1.8847926267281108, + "grad_norm": 0.16918110847473145, + "learning_rate": 4.84833125100116e-06, + "loss": 0.0104, + "step": 137 + }, + { + "epoch": 1.8986175115207373, + "grad_norm": 0.23489230871200562, + "learning_rate": 4.840111844535682e-06, + "loss": 0.0122, + "step": 138 + }, + { + "epoch": 1.912442396313364, + "grad_norm": 0.32796236872673035, + "learning_rate": 4.8316829028889076e-06, + "loss": 0.0109, + "step": 139 + }, + { + "epoch": 1.9262672811059907, + "grad_norm": 0.24210475385189056, + "learning_rate": 4.823045180793914e-06, + "loss": 0.0118, + "step": 140 + }, + { + "epoch": 1.9400921658986174, + "grad_norm": 0.3450548052787781, + "learning_rate": 4.8141994516781196e-06, + "loss": 0.0115, + "step": 141 + }, + { + "epoch": 1.9539170506912442, + "grad_norm": 0.23163923621177673, + "learning_rate": 4.805146507594034e-06, + "loss": 0.0122, + "step": 142 + }, + { + "epoch": 1.967741935483871, + "grad_norm": 0.8197745084762573, + "learning_rate": 4.7958871591483305e-06, + "loss": 0.0101, + "step": 143 + }, + { + "epoch": 1.9815668202764978, + "grad_norm": 0.2917576730251312, + "learning_rate": 4.786422235429269e-06, + "loss": 0.0078, + "step": 144 + }, + { + "epoch": 1.9953917050691246, + "grad_norm": 0.24417108297348022, + "learning_rate": 4.776752583932455e-06, + "loss": 0.0119, + "step": 145 + }, + { + "epoch": 2.0, + "grad_norm": 0.24417108297348022, + "learning_rate": 4.766879070484957e-06, + "loss": 0.0089, + "step": 146 + }, + { + "epoch": 2.013824884792627, + "grad_norm": 
0.4215025305747986, + "learning_rate": 4.756802579167781e-06, + "loss": 0.007, + "step": 147 + }, + { + "epoch": 2.0276497695852536, + "grad_norm": 0.2002098709344864, + "learning_rate": 4.746524012236706e-06, + "loss": 0.0078, + "step": 148 + }, + { + "epoch": 2.0414746543778803, + "grad_norm": 0.16432569921016693, + "learning_rate": 4.736044290041496e-06, + "loss": 0.0074, + "step": 149 + }, + { + "epoch": 2.055299539170507, + "grad_norm": 0.2516174018383026, + "learning_rate": 4.725364350943492e-06, + "loss": 0.0067, + "step": 150 + }, + { + "epoch": 2.0691244239631335, + "grad_norm": 0.24242427945137024, + "learning_rate": 4.714485151231593e-06, + "loss": 0.0083, + "step": 151 + }, + { + "epoch": 2.0829493087557602, + "grad_norm": 0.22929197549819946, + "learning_rate": 4.703407665036622e-06, + "loss": 0.0061, + "step": 152 + }, + { + "epoch": 2.096774193548387, + "grad_norm": 0.2929408550262451, + "learning_rate": 4.692132884244113e-06, + "loss": 0.0064, + "step": 153 + }, + { + "epoch": 2.110599078341014, + "grad_norm": 0.22497303783893585, + "learning_rate": 4.680661818405485e-06, + "loss": 0.0061, + "step": 154 + }, + { + "epoch": 2.1244239631336406, + "grad_norm": 0.13698536157608032, + "learning_rate": 4.668995494647653e-06, + "loss": 0.0059, + "step": 155 + }, + { + "epoch": 2.1382488479262673, + "grad_norm": 0.32037150859832764, + "learning_rate": 4.657134957581057e-06, + "loss": 0.0067, + "step": 156 + }, + { + "epoch": 2.152073732718894, + "grad_norm": 0.19389067590236664, + "learning_rate": 4.645081269206128e-06, + "loss": 0.0062, + "step": 157 + }, + { + "epoch": 2.165898617511521, + "grad_norm": 0.2791127562522888, + "learning_rate": 4.632835508818192e-06, + "loss": 0.0058, + "step": 158 + }, + { + "epoch": 2.1797235023041477, + "grad_norm": 0.2178739458322525, + "learning_rate": 4.620398772910833e-06, + "loss": 0.0056, + "step": 159 + }, + { + "epoch": 2.193548387096774, + "grad_norm": 0.29685622453689575, + "learning_rate": 4.607772175077712e-06, 
+ "loss": 0.0055, + "step": 160 + }, + { + "epoch": 2.207373271889401, + "grad_norm": 0.6792906522750854, + "learning_rate": 4.59495684591285e-06, + "loss": 0.0057, + "step": 161 + }, + { + "epoch": 2.2211981566820276, + "grad_norm": 0.17910148203372955, + "learning_rate": 4.581953932909403e-06, + "loss": 0.0046, + "step": 162 + }, + { + "epoch": 2.2350230414746544, + "grad_norm": 0.12593543529510498, + "learning_rate": 4.5687646003569055e-06, + "loss": 0.0046, + "step": 163 + }, + { + "epoch": 2.248847926267281, + "grad_norm": 0.15383680164813995, + "learning_rate": 4.555390029237026e-06, + "loss": 0.0059, + "step": 164 + }, + { + "epoch": 2.262672811059908, + "grad_norm": 0.2324540764093399, + "learning_rate": 4.541831417117815e-06, + "loss": 0.0067, + "step": 165 + }, + { + "epoch": 2.2764976958525347, + "grad_norm": 0.21278905868530273, + "learning_rate": 4.528089978046481e-06, + "loss": 0.0054, + "step": 166 + }, + { + "epoch": 2.2903225806451615, + "grad_norm": 0.2499057948589325, + "learning_rate": 4.514166942440679e-06, + "loss": 0.003, + "step": 167 + }, + { + "epoch": 2.3041474654377883, + "grad_norm": 0.1734611839056015, + "learning_rate": 4.5000635569783365e-06, + "loss": 0.0043, + "step": 168 + }, + { + "epoch": 2.3179723502304146, + "grad_norm": 0.17815802991390228, + "learning_rate": 4.4857810844860325e-06, + "loss": 0.0048, + "step": 169 + }, + { + "epoch": 2.3317972350230414, + "grad_norm": 0.22731409966945648, + "learning_rate": 4.471320803825915e-06, + "loss": 0.0034, + "step": 170 + }, + { + "epoch": 2.345622119815668, + "grad_norm": 0.23811140656471252, + "learning_rate": 4.4566840097811956e-06, + "loss": 0.0029, + "step": 171 + }, + { + "epoch": 2.359447004608295, + "grad_norm": 0.17744024097919464, + "learning_rate": 4.4418720129402145e-06, + "loss": 0.0029, + "step": 172 + }, + { + "epoch": 2.3732718894009217, + "grad_norm": 0.24912229180335999, + "learning_rate": 4.426886139579083e-06, + "loss": 0.0049, + "step": 173 + }, + { + "epoch": 
2.3870967741935485, + "grad_norm": 0.17039696872234344, + "learning_rate": 4.411727731542937e-06, + "loss": 0.003, + "step": 174 + }, + { + "epoch": 2.4009216589861753, + "grad_norm": 0.3089725375175476, + "learning_rate": 4.39639814612578e-06, + "loss": 0.0034, + "step": 175 + }, + { + "epoch": 2.4147465437788016, + "grad_norm": 0.22647598385810852, + "learning_rate": 4.3808987559489536e-06, + "loss": 0.0058, + "step": 176 + }, + { + "epoch": 2.4285714285714284, + "grad_norm": 0.19015835225582123, + "learning_rate": 4.365230948838232e-06, + "loss": 0.004, + "step": 177 + }, + { + "epoch": 2.442396313364055, + "grad_norm": 0.1825973391532898, + "learning_rate": 4.349396127699552e-06, + "loss": 0.0032, + "step": 178 + }, + { + "epoch": 2.456221198156682, + "grad_norm": 0.15705449879169464, + "learning_rate": 4.3333957103934025e-06, + "loss": 0.0035, + "step": 179 + }, + { + "epoch": 2.4700460829493087, + "grad_norm": 0.19110225141048431, + "learning_rate": 4.317231129607859e-06, + "loss": 0.0019, + "step": 180 + }, + { + "epoch": 2.4838709677419355, + "grad_norm": 0.1481270045042038, + "learning_rate": 4.30090383273031e-06, + "loss": 0.0035, + "step": 181 + }, + { + "epoch": 2.4976958525345623, + "grad_norm": 0.19533571600914001, + "learning_rate": 4.2844152817178476e-06, + "loss": 0.0023, + "step": 182 + }, + { + "epoch": 2.511520737327189, + "grad_norm": 0.1991293579339981, + "learning_rate": 4.267766952966369e-06, + "loss": 0.0025, + "step": 183 + }, + { + "epoch": 2.525345622119816, + "grad_norm": 0.22637878358364105, + "learning_rate": 4.2509603371783776e-06, + "loss": 0.0029, + "step": 184 + }, + { + "epoch": 2.539170506912442, + "grad_norm": 0.21984712779521942, + "learning_rate": 4.233996939229502e-06, + "loss": 0.0035, + "step": 185 + }, + { + "epoch": 2.5529953917050694, + "grad_norm": 0.25706061720848083, + "learning_rate": 4.216878278033753e-06, + "loss": 0.0033, + "step": 186 + }, + { + "epoch": 2.5668202764976957, + "grad_norm": 0.224118173122406, + 
"learning_rate": 4.199605886407515e-06, + "loss": 0.0017, + "step": 187 + }, + { + "epoch": 2.5806451612903225, + "grad_norm": 0.0781751424074173, + "learning_rate": 4.1821813109322975e-06, + "loss": 0.002, + "step": 188 + }, + { + "epoch": 2.5944700460829493, + "grad_norm": 0.2209765911102295, + "learning_rate": 4.164606111816256e-06, + "loss": 0.0018, + "step": 189 + }, + { + "epoch": 2.608294930875576, + "grad_norm": 0.12815824151039124, + "learning_rate": 4.146881862754485e-06, + "loss": 0.003, + "step": 190 + }, + { + "epoch": 2.622119815668203, + "grad_norm": 0.3006991147994995, + "learning_rate": 4.129010150788112e-06, + "loss": 0.0022, + "step": 191 + }, + { + "epoch": 2.6359447004608296, + "grad_norm": 0.19085584580898285, + "learning_rate": 4.110992576162193e-06, + "loss": 0.0026, + "step": 192 + }, + { + "epoch": 2.6497695852534564, + "grad_norm": 0.13027659058570862, + "learning_rate": 4.092830752182423e-06, + "loss": 0.0015, + "step": 193 + }, + { + "epoch": 2.6635944700460827, + "grad_norm": 0.16998590528964996, + "learning_rate": 4.074526305070679e-06, + "loss": 0.0018, + "step": 194 + }, + { + "epoch": 2.6774193548387095, + "grad_norm": 0.1743537187576294, + "learning_rate": 4.056080873819412e-06, + "loss": 0.0022, + "step": 195 + }, + { + "epoch": 2.6912442396313363, + "grad_norm": 0.3566405177116394, + "learning_rate": 4.037496110044885e-06, + "loss": 0.0018, + "step": 196 + }, + { + "epoch": 2.705069124423963, + "grad_norm": 0.274739146232605, + "learning_rate": 4.018773677839289e-06, + "loss": 0.0012, + "step": 197 + }, + { + "epoch": 2.71889400921659, + "grad_norm": 0.12038746476173401, + "learning_rate": 3.999915253621739e-06, + "loss": 0.0013, + "step": 198 + }, + { + "epoch": 2.7327188940092166, + "grad_norm": 0.12693172693252563, + "learning_rate": 3.980922525988167e-06, + "loss": 0.0017, + "step": 199 + }, + { + "epoch": 2.7465437788018434, + "grad_norm": 0.11907753348350525, + "learning_rate": 3.961797195560118e-06, + "loss": 0.001, + 
"step": 200 + }, + { + "epoch": 2.76036866359447, + "grad_norm": 0.1901165395975113, + "learning_rate": 3.942540974832486e-06, + "loss": 0.0028, + "step": 201 + }, + { + "epoch": 2.774193548387097, + "grad_norm": 0.2039843052625656, + "learning_rate": 3.9231555880201655e-06, + "loss": 0.0011, + "step": 202 + }, + { + "epoch": 2.7880184331797233, + "grad_norm": 0.16181506216526031, + "learning_rate": 3.903642770903671e-06, + "loss": 0.003, + "step": 203 + }, + { + "epoch": 2.80184331797235, + "grad_norm": 0.13345211744308472, + "learning_rate": 3.884004270673711e-06, + "loss": 0.0023, + "step": 204 + }, + { + "epoch": 2.815668202764977, + "grad_norm": 0.19453725218772888, + "learning_rate": 3.864241845774746e-06, + "loss": 0.001, + "step": 205 + }, + { + "epoch": 2.8294930875576036, + "grad_norm": 0.18157535791397095, + "learning_rate": 3.844357265747531e-06, + "loss": 0.0029, + "step": 206 + }, + { + "epoch": 2.8433179723502304, + "grad_norm": 0.17876467108726501, + "learning_rate": 3.8243523110706736e-06, + "loss": 0.0018, + "step": 207 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 0.13000421226024628, + "learning_rate": 3.8042287730012117e-06, + "loss": 0.0011, + "step": 208 + }, + { + "epoch": 2.870967741935484, + "grad_norm": 0.08808371424674988, + "learning_rate": 3.7839884534142157e-06, + "loss": 0.0018, + "step": 209 + }, + { + "epoch": 2.8847926267281108, + "grad_norm": 0.32318148016929626, + "learning_rate": 3.7636331646414524e-06, + "loss": 0.0012, + "step": 210 + }, + { + "epoch": 2.8986175115207375, + "grad_norm": 0.1259954422712326, + "learning_rate": 3.7431647293091076e-06, + "loss": 0.0012, + "step": 211 + }, + { + "epoch": 2.912442396313364, + "grad_norm": 0.1344563215970993, + "learning_rate": 3.7225849801745835e-06, + "loss": 0.0006, + "step": 212 + }, + { + "epoch": 2.9262672811059907, + "grad_norm": 0.09105626493692398, + "learning_rate": 3.701895759962397e-06, + "loss": 0.0007, + "step": 213 + }, + { + "epoch": 2.9400921658986174, + 
"grad_norm": 0.11718853563070297, + "learning_rate": 3.6810989211991777e-06, + "loss": 0.0022, + "step": 214 + }, + { + "epoch": 2.953917050691244, + "grad_norm": 0.10988112539052963, + "learning_rate": 3.6601963260477923e-06, + "loss": 0.0007, + "step": 215 + }, + { + "epoch": 2.967741935483871, + "grad_norm": 0.12010538578033447, + "learning_rate": 3.6391898461406045e-06, + "loss": 0.0014, + "step": 216 + } + ], + "logging_steps": 1, + "max_steps": 432, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 72, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.3809900078187414e+19, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-216/training_args.bin b/checkpoint-216/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..66c881ded23f97eecfbd08abb955a7188907de16 --- /dev/null +++ b/checkpoint-216/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bccf4859d2ee74fd992386ba4a70a4c4fc6d0da061af69465c1db71ce0f24882 +size 7928 diff --git a/checkpoint-216/zero_to_fp32.py b/checkpoint-216/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/checkpoint-216/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. 
+# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) 
+ + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = 
torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, 
zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + 
full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + 
wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # 
recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. 
+ + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. 
(one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/checkpoint-288/README.md b/checkpoint-288/README.md new file mode 100644 index 0000000000000000000000000000000000000000..037e1a543b9c1891b5c6981f89d5b7c7c9a907ae --- /dev/null +++ b/checkpoint-288/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Llama-3.1-70B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.0 \ No newline at end of file diff --git a/checkpoint-288/adapter_config.json b/checkpoint-288/adapter_config.json new file mode 100644 index 
0000000000000000000000000000000000000000..3295690c20dff5dc1d1f30f8500f0efb7e255838 --- /dev/null +++ b/checkpoint-288/adapter_config.json @@ -0,0 +1,42 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.1-70B-Instruct", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 512, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [ + "embed_tokens", + "lm_head" + ], + "peft_type": "LORA", + "r": 256, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "down_proj", + "gate_proj", + "v_proj", + "up_proj", + "k_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-288/adapter_model.safetensors b/checkpoint-288/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fc4c2e8a66739c6d3361fcdffd67a9031e168499 --- /dev/null +++ b/checkpoint-288/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c8348f2492629e673ecc2872f32d21418cc80e9a94d17b73c602a861679bfa9 +size 10829849744 diff --git a/checkpoint-288/global_step286/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-288/global_step286/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d1eb83ac2303fd4b3ac04a5453d08bba310101b6 --- /dev/null +++ b/checkpoint-288/global_step286/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:43bace605354301797a7d7fb1939cab063ab57a42ef3f0ad40a90f368f311af8 +size 21659418140 diff --git a/checkpoint-288/global_step286/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-288/global_step286/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c90864154768f412f669d7e15a11286781f37920 --- /dev/null +++ b/checkpoint-288/global_step286/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fbc5e702f698f4beb2d2241fbcebf13e8f59c8c9503fc94e4005e37d9696072c +size 21659457372 diff --git a/checkpoint-288/global_step286/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-288/global_step286/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c460a40495a3a4eea54a61d638333bebaf6f06c4 --- /dev/null +++ b/checkpoint-288/global_step286/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5071be11a0037e80eea6c6a9533643637e7cc32fa3839ca95a88661fc02ac6ab +size 21659417820 diff --git a/checkpoint-288/global_step286/mp_rank_00_model_states.pt b/checkpoint-288/global_step286/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..80af4fc89afbe003a5905c1017306e1d859ab4f8 --- /dev/null +++ b/checkpoint-288/global_step286/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c62ca72cd7c7caffe5e9b3455ce48eb0802204b13b9e4a6b56453f0b85ec6c36 +size 11918643933 diff --git a/checkpoint-288/latest b/checkpoint-288/latest new file mode 100644 index 0000000000000000000000000000000000000000..d39b7b89e4c7ece066f462dc46df67da65d1810e --- /dev/null +++ b/checkpoint-288/latest @@ -0,0 +1 @@ +global_step286 \ No newline at end of file diff --git a/checkpoint-288/rng_state_0.pth b/checkpoint-288/rng_state_0.pth new file mode 100644 index 
0000000000000000000000000000000000000000..51096e4dafd3a8bfeb6752f6ae421bb3e6fb942b --- /dev/null +++ b/checkpoint-288/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c97e3335cd6fbb20b76a202cb002eb217c1982b611bc1a714282da4176c8f5c +size 14768 diff --git a/checkpoint-288/rng_state_1.pth b/checkpoint-288/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..44511aabee726de376a223133282d6b368dbef19 --- /dev/null +++ b/checkpoint-288/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c91130d325023d767cdd46255e1fd9b3e83d624f256c07f9c9c131849abfdec3 +size 14768 diff --git a/checkpoint-288/rng_state_2.pth b/checkpoint-288/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..5dbeee29f2f0eca8b7f1789b62e96bd1de8e1772 --- /dev/null +++ b/checkpoint-288/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b246bd578c3e3ae237e25a547bf3d85d3c85f96c12dfade914140855e2d1bec0 +size 14768 diff --git a/checkpoint-288/scheduler.pt b/checkpoint-288/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..7628c0024597416a8211ffbd5a68418d83a678eb --- /dev/null +++ b/checkpoint-288/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b69b4d3a079847fc3286aa16b458c66149593b0a314e964b178b2d2904a96b7b +size 1064 diff --git a/checkpoint-288/special_tokens_map.json b/checkpoint-288/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..278b7f0f84be865c4687700ee7b3c63d89a51e18 --- /dev/null +++ b/checkpoint-288/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + 
"pad_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-288/tokenizer.json b/checkpoint-288/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-288/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-288/tokenizer_config.json b/checkpoint-288/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ca91a2ef55f4239a7af81d7c9abb05f53621a07b --- /dev/null +++ b/checkpoint-288/tokenizer_config.json @@ -0,0 +1,2064 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": 
"<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": 
"<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": 
"<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": 
"<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": 
"<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + 
"extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|end_of_text|>", + "tokenizer_class": "PreTrainedTokenizer" +} diff --git a/checkpoint-288/trainer_state.json b/checkpoint-288/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..8c4f3a432bb87c136bc4132f8baa23faceaf0e2f --- /dev/null +++ b/checkpoint-288/trainer_state.json @@ -0,0 +1,2049 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.953917050691244, + "eval_steps": 500, + "global_step": 288, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.013824884792626729, + "grad_norm": 31.00213623046875, + "learning_rate": 5.0000000000000004e-08, + "loss": 2.2089, + "step": 1 + }, + { + "epoch": 0.027649769585253458, + "grad_norm": 30.27136993408203, + "learning_rate": 1.0000000000000001e-07, + "loss": 2.1536, + "step": 2 + }, + { + "epoch": 0.041474654377880185, + "grad_norm": 30.48703384399414, + "learning_rate": 1.5000000000000002e-07, + "loss": 2.1581, + "step": 3 + }, + { + "epoch": 0.055299539170506916, + "grad_norm": 30.779329299926758, + "learning_rate": 2.0000000000000002e-07, + "loss": 2.1741, + "step": 4 + }, + { + "epoch": 0.06912442396313365, + "grad_norm": 31.22808837890625, + "learning_rate": 2.5000000000000004e-07, + "loss": 2.1864, + "step": 5 + }, + { + "epoch": 0.08294930875576037, + "grad_norm": 30.783327102661133, + "learning_rate": 3.0000000000000004e-07, + "loss": 2.1993, + "step": 6 + }, + { + "epoch": 0.0967741935483871, + "grad_norm": 30.57423210144043, + "learning_rate": 3.5000000000000004e-07, + "loss": 2.1506, + "step": 7 + }, + { + "epoch": 0.11059907834101383, + "grad_norm": 30.952186584472656, + "learning_rate": 4.0000000000000003e-07, + "loss": 2.1599, + "step": 8 + }, + { + "epoch": 0.12442396313364056, + "grad_norm": 30.37245750427246, + "learning_rate": 
4.5000000000000003e-07, + "loss": 2.1572, + "step": 9 + }, + { + "epoch": 0.1382488479262673, + "grad_norm": 30.930192947387695, + "learning_rate": 5.000000000000001e-07, + "loss": 2.1447, + "step": 10 + }, + { + "epoch": 0.15207373271889402, + "grad_norm": 29.735448837280273, + "learning_rate": 5.5e-07, + "loss": 2.0742, + "step": 11 + }, + { + "epoch": 0.16589861751152074, + "grad_norm": 29.62826156616211, + "learning_rate": 6.000000000000001e-07, + "loss": 2.061, + "step": 12 + }, + { + "epoch": 0.17972350230414746, + "grad_norm": 28.937463760375977, + "learning_rate": 6.5e-07, + "loss": 1.9974, + "step": 13 + }, + { + "epoch": 0.1935483870967742, + "grad_norm": 29.24833106994629, + "learning_rate": 7.000000000000001e-07, + "loss": 1.9833, + "step": 14 + }, + { + "epoch": 0.2073732718894009, + "grad_norm": 28.122018814086914, + "learning_rate": 7.5e-07, + "loss": 1.8934, + "step": 15 + }, + { + "epoch": 0.22119815668202766, + "grad_norm": 28.059659957885742, + "learning_rate": 8.000000000000001e-07, + "loss": 1.875, + "step": 16 + }, + { + "epoch": 0.2350230414746544, + "grad_norm": 27.361961364746094, + "learning_rate": 8.500000000000001e-07, + "loss": 1.8009, + "step": 17 + }, + { + "epoch": 0.2488479262672811, + "grad_norm": 26.721765518188477, + "learning_rate": 9.000000000000001e-07, + "loss": 1.7116, + "step": 18 + }, + { + "epoch": 0.2626728110599078, + "grad_norm": 25.37330436706543, + "learning_rate": 9.500000000000001e-07, + "loss": 1.5608, + "step": 19 + }, + { + "epoch": 0.2764976958525346, + "grad_norm": 25.81206703186035, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.5043, + "step": 20 + }, + { + "epoch": 0.2903225806451613, + "grad_norm": 25.539344787597656, + "learning_rate": 1.0500000000000001e-06, + "loss": 1.3673, + "step": 21 + }, + { + "epoch": 0.30414746543778803, + "grad_norm": 25.097164154052734, + "learning_rate": 1.1e-06, + "loss": 1.2029, + "step": 22 + }, + { + "epoch": 0.31797235023041476, + "grad_norm": 24.619497299194336, 
+ "learning_rate": 1.1500000000000002e-06, + "loss": 1.0458, + "step": 23 + }, + { + "epoch": 0.3317972350230415, + "grad_norm": 23.820302963256836, + "learning_rate": 1.2000000000000002e-06, + "loss": 0.8723, + "step": 24 + }, + { + "epoch": 0.3456221198156682, + "grad_norm": 23.12735939025879, + "learning_rate": 1.25e-06, + "loss": 0.7183, + "step": 25 + }, + { + "epoch": 0.35944700460829493, + "grad_norm": 20.127134323120117, + "learning_rate": 1.3e-06, + "loss": 0.5248, + "step": 26 + }, + { + "epoch": 0.37327188940092165, + "grad_norm": 15.901495933532715, + "learning_rate": 1.3500000000000002e-06, + "loss": 0.3689, + "step": 27 + }, + { + "epoch": 0.3870967741935484, + "grad_norm": 11.053832054138184, + "learning_rate": 1.4000000000000001e-06, + "loss": 0.2482, + "step": 28 + }, + { + "epoch": 0.4009216589861751, + "grad_norm": 7.248495578765869, + "learning_rate": 1.45e-06, + "loss": 0.1847, + "step": 29 + }, + { + "epoch": 0.4147465437788018, + "grad_norm": 5.378540515899658, + "learning_rate": 1.5e-06, + "loss": 0.1423, + "step": 30 + }, + { + "epoch": 0.42857142857142855, + "grad_norm": 3.8371808528900146, + "learning_rate": 1.5500000000000002e-06, + "loss": 0.1152, + "step": 31 + }, + { + "epoch": 0.4423963133640553, + "grad_norm": 2.2655274868011475, + "learning_rate": 1.6000000000000001e-06, + "loss": 0.0845, + "step": 32 + }, + { + "epoch": 0.45622119815668205, + "grad_norm": 1.5746861696243286, + "learning_rate": 1.6500000000000003e-06, + "loss": 0.0711, + "step": 33 + }, + { + "epoch": 0.4700460829493088, + "grad_norm": 1.3510947227478027, + "learning_rate": 1.7000000000000002e-06, + "loss": 0.0734, + "step": 34 + }, + { + "epoch": 0.4838709677419355, + "grad_norm": 0.9737389087677002, + "learning_rate": 1.75e-06, + "loss": 0.0651, + "step": 35 + }, + { + "epoch": 0.4976958525345622, + "grad_norm": 0.9815284609794617, + "learning_rate": 1.8000000000000001e-06, + "loss": 0.0593, + "step": 36 + }, + { + "epoch": 0.511520737327189, + "grad_norm": 
0.8567912578582764, + "learning_rate": 1.85e-06, + "loss": 0.0543, + "step": 37 + }, + { + "epoch": 0.5253456221198156, + "grad_norm": 0.6773302555084229, + "learning_rate": 1.9000000000000002e-06, + "loss": 0.0622, + "step": 38 + }, + { + "epoch": 0.5391705069124424, + "grad_norm": 0.49936285614967346, + "learning_rate": 1.9500000000000004e-06, + "loss": 0.0511, + "step": 39 + }, + { + "epoch": 0.5529953917050692, + "grad_norm": 0.6253588795661926, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.0478, + "step": 40 + }, + { + "epoch": 0.5668202764976958, + "grad_norm": 0.5103089809417725, + "learning_rate": 2.05e-06, + "loss": 0.0465, + "step": 41 + }, + { + "epoch": 0.5806451612903226, + "grad_norm": 0.29294702410697937, + "learning_rate": 2.1000000000000002e-06, + "loss": 0.0456, + "step": 42 + }, + { + "epoch": 0.5944700460829493, + "grad_norm": 0.4237954616546631, + "learning_rate": 2.15e-06, + "loss": 0.0501, + "step": 43 + }, + { + "epoch": 0.6082949308755761, + "grad_norm": 0.42243412137031555, + "learning_rate": 2.2e-06, + "loss": 0.0388, + "step": 44 + }, + { + "epoch": 0.6221198156682027, + "grad_norm": 0.37881818413734436, + "learning_rate": 2.25e-06, + "loss": 0.0415, + "step": 45 + }, + { + "epoch": 0.6359447004608295, + "grad_norm": 0.4941152036190033, + "learning_rate": 2.3000000000000004e-06, + "loss": 0.045, + "step": 46 + }, + { + "epoch": 0.6497695852534562, + "grad_norm": 0.3046450912952423, + "learning_rate": 2.35e-06, + "loss": 0.0386, + "step": 47 + }, + { + "epoch": 0.663594470046083, + "grad_norm": 0.39361852407455444, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.0447, + "step": 48 + }, + { + "epoch": 0.6774193548387096, + "grad_norm": 0.5190001130104065, + "learning_rate": 2.4500000000000003e-06, + "loss": 0.0364, + "step": 49 + }, + { + "epoch": 0.6912442396313364, + "grad_norm": 0.372072696685791, + "learning_rate": 2.5e-06, + "loss": 0.043, + "step": 50 + }, + { + "epoch": 0.7050691244239631, + "grad_norm": 
0.3756551146507263, + "learning_rate": 2.55e-06, + "loss": 0.0424, + "step": 51 + }, + { + "epoch": 0.7188940092165899, + "grad_norm": 0.4593554437160492, + "learning_rate": 2.6e-06, + "loss": 0.0387, + "step": 52 + }, + { + "epoch": 0.7327188940092166, + "grad_norm": 0.2931855618953705, + "learning_rate": 2.6500000000000005e-06, + "loss": 0.0396, + "step": 53 + }, + { + "epoch": 0.7465437788018433, + "grad_norm": 0.38429534435272217, + "learning_rate": 2.7000000000000004e-06, + "loss": 0.0373, + "step": 54 + }, + { + "epoch": 0.7603686635944701, + "grad_norm": 0.3506857752799988, + "learning_rate": 2.7500000000000004e-06, + "loss": 0.04, + "step": 55 + }, + { + "epoch": 0.7741935483870968, + "grad_norm": 0.29847028851509094, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.0369, + "step": 56 + }, + { + "epoch": 0.7880184331797235, + "grad_norm": 0.3653375506401062, + "learning_rate": 2.85e-06, + "loss": 0.0396, + "step": 57 + }, + { + "epoch": 0.8018433179723502, + "grad_norm": 0.3163083791732788, + "learning_rate": 2.9e-06, + "loss": 0.0337, + "step": 58 + }, + { + "epoch": 0.815668202764977, + "grad_norm": 0.3734363615512848, + "learning_rate": 2.95e-06, + "loss": 0.0327, + "step": 59 + }, + { + "epoch": 0.8294930875576036, + "grad_norm": 0.29547712206840515, + "learning_rate": 3e-06, + "loss": 0.0365, + "step": 60 + }, + { + "epoch": 0.8433179723502304, + "grad_norm": 0.4041007161140442, + "learning_rate": 3.05e-06, + "loss": 0.038, + "step": 61 + }, + { + "epoch": 0.8571428571428571, + "grad_norm": 0.3602149784564972, + "learning_rate": 3.1000000000000004e-06, + "loss": 0.033, + "step": 62 + }, + { + "epoch": 0.8709677419354839, + "grad_norm": 0.2948857545852661, + "learning_rate": 3.1500000000000003e-06, + "loss": 0.0386, + "step": 63 + }, + { + "epoch": 0.8847926267281107, + "grad_norm": 0.39098358154296875, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.0323, + "step": 64 + }, + { + "epoch": 0.8986175115207373, + "grad_norm": 
0.3692062795162201, + "learning_rate": 3.2500000000000002e-06, + "loss": 0.0309, + "step": 65 + }, + { + "epoch": 0.9124423963133641, + "grad_norm": 0.3967229425907135, + "learning_rate": 3.3000000000000006e-06, + "loss": 0.0346, + "step": 66 + }, + { + "epoch": 0.9262672811059908, + "grad_norm": 0.47776708006858826, + "learning_rate": 3.3500000000000005e-06, + "loss": 0.0355, + "step": 67 + }, + { + "epoch": 0.9400921658986175, + "grad_norm": 0.21545131504535675, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.0294, + "step": 68 + }, + { + "epoch": 0.9539170506912442, + "grad_norm": 0.23738539218902588, + "learning_rate": 3.45e-06, + "loss": 0.0308, + "step": 69 + }, + { + "epoch": 0.967741935483871, + "grad_norm": 0.29174014925956726, + "learning_rate": 3.5e-06, + "loss": 0.0312, + "step": 70 + }, + { + "epoch": 0.9815668202764977, + "grad_norm": 0.38475602865219116, + "learning_rate": 3.5500000000000003e-06, + "loss": 0.0324, + "step": 71 + }, + { + "epoch": 0.9953917050691244, + "grad_norm": 0.4077378809452057, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.0297, + "step": 72 + }, + { + "epoch": 1.0, + "grad_norm": 0.4077378809452057, + "learning_rate": 3.65e-06, + "loss": 0.031, + "step": 73 + }, + { + "epoch": 1.0138248847926268, + "grad_norm": 0.46581539511680603, + "learning_rate": 3.7e-06, + "loss": 0.0313, + "step": 74 + }, + { + "epoch": 1.0276497695852536, + "grad_norm": 0.24417200684547424, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.027, + "step": 75 + }, + { + "epoch": 1.0414746543778801, + "grad_norm": 0.20425117015838623, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.0307, + "step": 76 + }, + { + "epoch": 1.055299539170507, + "grad_norm": 0.3578161597251892, + "learning_rate": 3.85e-06, + "loss": 0.0312, + "step": 77 + }, + { + "epoch": 1.0691244239631337, + "grad_norm": 0.39486679434776306, + "learning_rate": 3.900000000000001e-06, + "loss": 0.0294, + "step": 78 + }, + { + "epoch": 1.0829493087557605, + 
"grad_norm": 0.3932795226573944, + "learning_rate": 3.95e-06, + "loss": 0.0307, + "step": 79 + }, + { + "epoch": 1.096774193548387, + "grad_norm": 0.2946235239505768, + "learning_rate": 4.000000000000001e-06, + "loss": 0.0257, + "step": 80 + }, + { + "epoch": 1.1105990783410138, + "grad_norm": 0.3318672776222229, + "learning_rate": 4.05e-06, + "loss": 0.0296, + "step": 81 + }, + { + "epoch": 1.1244239631336406, + "grad_norm": 0.23701588809490204, + "learning_rate": 4.1e-06, + "loss": 0.0298, + "step": 82 + }, + { + "epoch": 1.1382488479262673, + "grad_norm": 0.2415941059589386, + "learning_rate": 4.15e-06, + "loss": 0.0256, + "step": 83 + }, + { + "epoch": 1.1520737327188941, + "grad_norm": 0.24098087847232819, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.0263, + "step": 84 + }, + { + "epoch": 1.1658986175115207, + "grad_norm": 0.3530103862285614, + "learning_rate": 4.25e-06, + "loss": 0.0308, + "step": 85 + }, + { + "epoch": 1.1797235023041475, + "grad_norm": 0.2382838875055313, + "learning_rate": 4.3e-06, + "loss": 0.0254, + "step": 86 + }, + { + "epoch": 1.1935483870967742, + "grad_norm": 0.2670588791370392, + "learning_rate": 4.350000000000001e-06, + "loss": 0.0255, + "step": 87 + }, + { + "epoch": 1.2073732718894008, + "grad_norm": 0.30723804235458374, + "learning_rate": 4.4e-06, + "loss": 0.0263, + "step": 88 + }, + { + "epoch": 1.2211981566820276, + "grad_norm": 0.505890965461731, + "learning_rate": 4.450000000000001e-06, + "loss": 0.0265, + "step": 89 + }, + { + "epoch": 1.2350230414746544, + "grad_norm": 0.24307991564273834, + "learning_rate": 4.5e-06, + "loss": 0.0227, + "step": 90 + }, + { + "epoch": 1.2488479262672811, + "grad_norm": 0.2198561429977417, + "learning_rate": 4.5500000000000005e-06, + "loss": 0.0261, + "step": 91 + }, + { + "epoch": 1.262672811059908, + "grad_norm": 0.2435183823108673, + "learning_rate": 4.600000000000001e-06, + "loss": 0.0225, + "step": 92 + }, + { + "epoch": 1.2764976958525347, + "grad_norm": 
0.18837811052799225, + "learning_rate": 4.65e-06, + "loss": 0.0218, + "step": 93 + }, + { + "epoch": 1.2903225806451613, + "grad_norm": 0.3818771541118622, + "learning_rate": 4.7e-06, + "loss": 0.0223, + "step": 94 + }, + { + "epoch": 1.304147465437788, + "grad_norm": 0.2358720600605011, + "learning_rate": 4.75e-06, + "loss": 0.0204, + "step": 95 + }, + { + "epoch": 1.3179723502304148, + "grad_norm": 0.25374144315719604, + "learning_rate": 4.800000000000001e-06, + "loss": 0.022, + "step": 96 + }, + { + "epoch": 1.3317972350230414, + "grad_norm": 0.36181601881980896, + "learning_rate": 4.85e-06, + "loss": 0.0244, + "step": 97 + }, + { + "epoch": 1.3456221198156681, + "grad_norm": 0.3156590759754181, + "learning_rate": 4.9000000000000005e-06, + "loss": 0.0233, + "step": 98 + }, + { + "epoch": 1.359447004608295, + "grad_norm": 0.21958638727664948, + "learning_rate": 4.95e-06, + "loss": 0.0218, + "step": 99 + }, + { + "epoch": 1.3732718894009217, + "grad_norm": 0.34455621242523193, + "learning_rate": 5e-06, + "loss": 0.0267, + "step": 100 + }, + { + "epoch": 1.3870967741935485, + "grad_norm": 0.283086359500885, + "learning_rate": 4.999888074163108e-06, + "loss": 0.0238, + "step": 101 + }, + { + "epoch": 1.400921658986175, + "grad_norm": 0.28856486082077026, + "learning_rate": 4.999552306674345e-06, + "loss": 0.0186, + "step": 102 + }, + { + "epoch": 1.4147465437788018, + "grad_norm": 0.26721692085266113, + "learning_rate": 4.998992727598557e-06, + "loss": 0.0193, + "step": 103 + }, + { + "epoch": 1.4285714285714286, + "grad_norm": 0.3459971249103546, + "learning_rate": 4.998209387040829e-06, + "loss": 0.0218, + "step": 104 + }, + { + "epoch": 1.4423963133640554, + "grad_norm": 0.25979122519493103, + "learning_rate": 4.9972023551419995e-06, + "loss": 0.0216, + "step": 105 + }, + { + "epoch": 1.456221198156682, + "grad_norm": 0.19960424304008484, + "learning_rate": 4.995971722072379e-06, + "loss": 0.0176, + "step": 106 + }, + { + "epoch": 1.4700460829493087, + 
"grad_norm": 0.2529441714286804, + "learning_rate": 4.9945175980236745e-06, + "loss": 0.0181, + "step": 107 + }, + { + "epoch": 1.4838709677419355, + "grad_norm": 0.2690267264842987, + "learning_rate": 4.992840113199131e-06, + "loss": 0.0196, + "step": 108 + }, + { + "epoch": 1.4976958525345623, + "grad_norm": 0.3516470789909363, + "learning_rate": 4.990939417801859e-06, + "loss": 0.0182, + "step": 109 + }, + { + "epoch": 1.511520737327189, + "grad_norm": 0.30167508125305176, + "learning_rate": 4.988815682021398e-06, + "loss": 0.0205, + "step": 110 + }, + { + "epoch": 1.5253456221198156, + "grad_norm": 0.3920849859714508, + "learning_rate": 4.986469096018472e-06, + "loss": 0.0177, + "step": 111 + }, + { + "epoch": 1.5391705069124424, + "grad_norm": 0.3274078369140625, + "learning_rate": 4.983899869907963e-06, + "loss": 0.0185, + "step": 112 + }, + { + "epoch": 1.5529953917050692, + "grad_norm": 0.2237282395362854, + "learning_rate": 4.981108233740096e-06, + "loss": 0.016, + "step": 113 + }, + { + "epoch": 1.5668202764976957, + "grad_norm": 0.23966379463672638, + "learning_rate": 4.978094437479843e-06, + "loss": 0.0183, + "step": 114 + }, + { + "epoch": 1.5806451612903225, + "grad_norm": 0.4027673602104187, + "learning_rate": 4.97485875098454e-06, + "loss": 0.0171, + "step": 115 + }, + { + "epoch": 1.5944700460829493, + "grad_norm": 0.24082835018634796, + "learning_rate": 4.971401463979722e-06, + "loss": 0.016, + "step": 116 + }, + { + "epoch": 1.608294930875576, + "grad_norm": 0.19387558102607727, + "learning_rate": 4.967722886033181e-06, + "loss": 0.0165, + "step": 117 + }, + { + "epoch": 1.6221198156682028, + "grad_norm": 0.33696162700653076, + "learning_rate": 4.963823346527249e-06, + "loss": 0.0154, + "step": 118 + }, + { + "epoch": 1.6359447004608296, + "grad_norm": 0.30290740728378296, + "learning_rate": 4.959703194629304e-06, + "loss": 0.0175, + "step": 119 + }, + { + "epoch": 1.6497695852534562, + "grad_norm": 0.3781787157058716, + "learning_rate": 
4.955362799260507e-06, + "loss": 0.0145, + "step": 120 + }, + { + "epoch": 1.663594470046083, + "grad_norm": 0.39995357394218445, + "learning_rate": 4.950802549062764e-06, + "loss": 0.015, + "step": 121 + }, + { + "epoch": 1.6774193548387095, + "grad_norm": 0.19926570355892181, + "learning_rate": 4.946022852363932e-06, + "loss": 0.0135, + "step": 122 + }, + { + "epoch": 1.6912442396313363, + "grad_norm": 0.22450515627861023, + "learning_rate": 4.9410241371412525e-06, + "loss": 0.0135, + "step": 123 + }, + { + "epoch": 1.705069124423963, + "grad_norm": 0.3588384985923767, + "learning_rate": 4.935806850983034e-06, + "loss": 0.0125, + "step": 124 + }, + { + "epoch": 1.7188940092165899, + "grad_norm": 0.28571122884750366, + "learning_rate": 4.9303714610485705e-06, + "loss": 0.0166, + "step": 125 + }, + { + "epoch": 1.7327188940092166, + "grad_norm": 0.3496967852115631, + "learning_rate": 4.924718454026318e-06, + "loss": 0.0139, + "step": 126 + }, + { + "epoch": 1.7465437788018434, + "grad_norm": 0.3279854357242584, + "learning_rate": 4.918848336090309e-06, + "loss": 0.0133, + "step": 127 + }, + { + "epoch": 1.7603686635944702, + "grad_norm": 0.19201801717281342, + "learning_rate": 4.912761632854834e-06, + "loss": 0.0151, + "step": 128 + }, + { + "epoch": 1.7741935483870968, + "grad_norm": 0.27701929211616516, + "learning_rate": 4.906458889327375e-06, + "loss": 0.0148, + "step": 129 + }, + { + "epoch": 1.7880184331797235, + "grad_norm": 0.2757968008518219, + "learning_rate": 4.899940669859807e-06, + "loss": 0.0118, + "step": 130 + }, + { + "epoch": 1.80184331797235, + "grad_norm": 0.18373191356658936, + "learning_rate": 4.893207558097867e-06, + "loss": 0.0149, + "step": 131 + }, + { + "epoch": 1.8156682027649769, + "grad_norm": 0.2116280496120453, + "learning_rate": 4.8862601569288885e-06, + "loss": 0.0129, + "step": 132 + }, + { + "epoch": 1.8294930875576036, + "grad_norm": 0.30384117364883423, + "learning_rate": 4.879099088427824e-06, + "loss": 0.0136, + "step": 133 + 
}, + { + "epoch": 1.8433179723502304, + "grad_norm": 0.3766787052154541, + "learning_rate": 4.871724993801541e-06, + "loss": 0.0123, + "step": 134 + }, + { + "epoch": 1.8571428571428572, + "grad_norm": 0.3401263356208801, + "learning_rate": 4.864138533331411e-06, + "loss": 0.0122, + "step": 135 + }, + { + "epoch": 1.870967741935484, + "grad_norm": 0.24321958422660828, + "learning_rate": 4.8563403863141825e-06, + "loss": 0.0123, + "step": 136 + }, + { + "epoch": 1.8847926267281108, + "grad_norm": 0.16918110847473145, + "learning_rate": 4.84833125100116e-06, + "loss": 0.0104, + "step": 137 + }, + { + "epoch": 1.8986175115207373, + "grad_norm": 0.23489230871200562, + "learning_rate": 4.840111844535682e-06, + "loss": 0.0122, + "step": 138 + }, + { + "epoch": 1.912442396313364, + "grad_norm": 0.32796236872673035, + "learning_rate": 4.8316829028889076e-06, + "loss": 0.0109, + "step": 139 + }, + { + "epoch": 1.9262672811059907, + "grad_norm": 0.24210475385189056, + "learning_rate": 4.823045180793914e-06, + "loss": 0.0118, + "step": 140 + }, + { + "epoch": 1.9400921658986174, + "grad_norm": 0.3450548052787781, + "learning_rate": 4.8141994516781196e-06, + "loss": 0.0115, + "step": 141 + }, + { + "epoch": 1.9539170506912442, + "grad_norm": 0.23163923621177673, + "learning_rate": 4.805146507594034e-06, + "loss": 0.0122, + "step": 142 + }, + { + "epoch": 1.967741935483871, + "grad_norm": 0.8197745084762573, + "learning_rate": 4.7958871591483305e-06, + "loss": 0.0101, + "step": 143 + }, + { + "epoch": 1.9815668202764978, + "grad_norm": 0.2917576730251312, + "learning_rate": 4.786422235429269e-06, + "loss": 0.0078, + "step": 144 + }, + { + "epoch": 1.9953917050691246, + "grad_norm": 0.24417108297348022, + "learning_rate": 4.776752583932455e-06, + "loss": 0.0119, + "step": 145 + }, + { + "epoch": 2.0, + "grad_norm": 0.24417108297348022, + "learning_rate": 4.766879070484957e-06, + "loss": 0.0089, + "step": 146 + }, + { + "epoch": 2.013824884792627, + "grad_norm": 
0.4215025305747986, + "learning_rate": 4.756802579167781e-06, + "loss": 0.007, + "step": 147 + }, + { + "epoch": 2.0276497695852536, + "grad_norm": 0.2002098709344864, + "learning_rate": 4.746524012236706e-06, + "loss": 0.0078, + "step": 148 + }, + { + "epoch": 2.0414746543778803, + "grad_norm": 0.16432569921016693, + "learning_rate": 4.736044290041496e-06, + "loss": 0.0074, + "step": 149 + }, + { + "epoch": 2.055299539170507, + "grad_norm": 0.2516174018383026, + "learning_rate": 4.725364350943492e-06, + "loss": 0.0067, + "step": 150 + }, + { + "epoch": 2.0691244239631335, + "grad_norm": 0.24242427945137024, + "learning_rate": 4.714485151231593e-06, + "loss": 0.0083, + "step": 151 + }, + { + "epoch": 2.0829493087557602, + "grad_norm": 0.22929197549819946, + "learning_rate": 4.703407665036622e-06, + "loss": 0.0061, + "step": 152 + }, + { + "epoch": 2.096774193548387, + "grad_norm": 0.2929408550262451, + "learning_rate": 4.692132884244113e-06, + "loss": 0.0064, + "step": 153 + }, + { + "epoch": 2.110599078341014, + "grad_norm": 0.22497303783893585, + "learning_rate": 4.680661818405485e-06, + "loss": 0.0061, + "step": 154 + }, + { + "epoch": 2.1244239631336406, + "grad_norm": 0.13698536157608032, + "learning_rate": 4.668995494647653e-06, + "loss": 0.0059, + "step": 155 + }, + { + "epoch": 2.1382488479262673, + "grad_norm": 0.32037150859832764, + "learning_rate": 4.657134957581057e-06, + "loss": 0.0067, + "step": 156 + }, + { + "epoch": 2.152073732718894, + "grad_norm": 0.19389067590236664, + "learning_rate": 4.645081269206128e-06, + "loss": 0.0062, + "step": 157 + }, + { + "epoch": 2.165898617511521, + "grad_norm": 0.2791127562522888, + "learning_rate": 4.632835508818192e-06, + "loss": 0.0058, + "step": 158 + }, + { + "epoch": 2.1797235023041477, + "grad_norm": 0.2178739458322525, + "learning_rate": 4.620398772910833e-06, + "loss": 0.0056, + "step": 159 + }, + { + "epoch": 2.193548387096774, + "grad_norm": 0.29685622453689575, + "learning_rate": 4.607772175077712e-06, 
+ "loss": 0.0055, + "step": 160 + }, + { + "epoch": 2.207373271889401, + "grad_norm": 0.6792906522750854, + "learning_rate": 4.59495684591285e-06, + "loss": 0.0057, + "step": 161 + }, + { + "epoch": 2.2211981566820276, + "grad_norm": 0.17910148203372955, + "learning_rate": 4.581953932909403e-06, + "loss": 0.0046, + "step": 162 + }, + { + "epoch": 2.2350230414746544, + "grad_norm": 0.12593543529510498, + "learning_rate": 4.5687646003569055e-06, + "loss": 0.0046, + "step": 163 + }, + { + "epoch": 2.248847926267281, + "grad_norm": 0.15383680164813995, + "learning_rate": 4.555390029237026e-06, + "loss": 0.0059, + "step": 164 + }, + { + "epoch": 2.262672811059908, + "grad_norm": 0.2324540764093399, + "learning_rate": 4.541831417117815e-06, + "loss": 0.0067, + "step": 165 + }, + { + "epoch": 2.2764976958525347, + "grad_norm": 0.21278905868530273, + "learning_rate": 4.528089978046481e-06, + "loss": 0.0054, + "step": 166 + }, + { + "epoch": 2.2903225806451615, + "grad_norm": 0.2499057948589325, + "learning_rate": 4.514166942440679e-06, + "loss": 0.003, + "step": 167 + }, + { + "epoch": 2.3041474654377883, + "grad_norm": 0.1734611839056015, + "learning_rate": 4.5000635569783365e-06, + "loss": 0.0043, + "step": 168 + }, + { + "epoch": 2.3179723502304146, + "grad_norm": 0.17815802991390228, + "learning_rate": 4.4857810844860325e-06, + "loss": 0.0048, + "step": 169 + }, + { + "epoch": 2.3317972350230414, + "grad_norm": 0.22731409966945648, + "learning_rate": 4.471320803825915e-06, + "loss": 0.0034, + "step": 170 + }, + { + "epoch": 2.345622119815668, + "grad_norm": 0.23811140656471252, + "learning_rate": 4.4566840097811956e-06, + "loss": 0.0029, + "step": 171 + }, + { + "epoch": 2.359447004608295, + "grad_norm": 0.17744024097919464, + "learning_rate": 4.4418720129402145e-06, + "loss": 0.0029, + "step": 172 + }, + { + "epoch": 2.3732718894009217, + "grad_norm": 0.24912229180335999, + "learning_rate": 4.426886139579083e-06, + "loss": 0.0049, + "step": 173 + }, + { + "epoch": 
2.3870967741935485, + "grad_norm": 0.17039696872234344, + "learning_rate": 4.411727731542937e-06, + "loss": 0.003, + "step": 174 + }, + { + "epoch": 2.4009216589861753, + "grad_norm": 0.3089725375175476, + "learning_rate": 4.39639814612578e-06, + "loss": 0.0034, + "step": 175 + }, + { + "epoch": 2.4147465437788016, + "grad_norm": 0.22647598385810852, + "learning_rate": 4.3808987559489536e-06, + "loss": 0.0058, + "step": 176 + }, + { + "epoch": 2.4285714285714284, + "grad_norm": 0.19015835225582123, + "learning_rate": 4.365230948838232e-06, + "loss": 0.004, + "step": 177 + }, + { + "epoch": 2.442396313364055, + "grad_norm": 0.1825973391532898, + "learning_rate": 4.349396127699552e-06, + "loss": 0.0032, + "step": 178 + }, + { + "epoch": 2.456221198156682, + "grad_norm": 0.15705449879169464, + "learning_rate": 4.3333957103934025e-06, + "loss": 0.0035, + "step": 179 + }, + { + "epoch": 2.4700460829493087, + "grad_norm": 0.19110225141048431, + "learning_rate": 4.317231129607859e-06, + "loss": 0.0019, + "step": 180 + }, + { + "epoch": 2.4838709677419355, + "grad_norm": 0.1481270045042038, + "learning_rate": 4.30090383273031e-06, + "loss": 0.0035, + "step": 181 + }, + { + "epoch": 2.4976958525345623, + "grad_norm": 0.19533571600914001, + "learning_rate": 4.2844152817178476e-06, + "loss": 0.0023, + "step": 182 + }, + { + "epoch": 2.511520737327189, + "grad_norm": 0.1991293579339981, + "learning_rate": 4.267766952966369e-06, + "loss": 0.0025, + "step": 183 + }, + { + "epoch": 2.525345622119816, + "grad_norm": 0.22637878358364105, + "learning_rate": 4.2509603371783776e-06, + "loss": 0.0029, + "step": 184 + }, + { + "epoch": 2.539170506912442, + "grad_norm": 0.21984712779521942, + "learning_rate": 4.233996939229502e-06, + "loss": 0.0035, + "step": 185 + }, + { + "epoch": 2.5529953917050694, + "grad_norm": 0.25706061720848083, + "learning_rate": 4.216878278033753e-06, + "loss": 0.0033, + "step": 186 + }, + { + "epoch": 2.5668202764976957, + "grad_norm": 0.224118173122406, + 
"learning_rate": 4.199605886407515e-06, + "loss": 0.0017, + "step": 187 + }, + { + "epoch": 2.5806451612903225, + "grad_norm": 0.0781751424074173, + "learning_rate": 4.1821813109322975e-06, + "loss": 0.002, + "step": 188 + }, + { + "epoch": 2.5944700460829493, + "grad_norm": 0.2209765911102295, + "learning_rate": 4.164606111816256e-06, + "loss": 0.0018, + "step": 189 + }, + { + "epoch": 2.608294930875576, + "grad_norm": 0.12815824151039124, + "learning_rate": 4.146881862754485e-06, + "loss": 0.003, + "step": 190 + }, + { + "epoch": 2.622119815668203, + "grad_norm": 0.3006991147994995, + "learning_rate": 4.129010150788112e-06, + "loss": 0.0022, + "step": 191 + }, + { + "epoch": 2.6359447004608296, + "grad_norm": 0.19085584580898285, + "learning_rate": 4.110992576162193e-06, + "loss": 0.0026, + "step": 192 + }, + { + "epoch": 2.6497695852534564, + "grad_norm": 0.13027659058570862, + "learning_rate": 4.092830752182423e-06, + "loss": 0.0015, + "step": 193 + }, + { + "epoch": 2.6635944700460827, + "grad_norm": 0.16998590528964996, + "learning_rate": 4.074526305070679e-06, + "loss": 0.0018, + "step": 194 + }, + { + "epoch": 2.6774193548387095, + "grad_norm": 0.1743537187576294, + "learning_rate": 4.056080873819412e-06, + "loss": 0.0022, + "step": 195 + }, + { + "epoch": 2.6912442396313363, + "grad_norm": 0.3566405177116394, + "learning_rate": 4.037496110044885e-06, + "loss": 0.0018, + "step": 196 + }, + { + "epoch": 2.705069124423963, + "grad_norm": 0.274739146232605, + "learning_rate": 4.018773677839289e-06, + "loss": 0.0012, + "step": 197 + }, + { + "epoch": 2.71889400921659, + "grad_norm": 0.12038746476173401, + "learning_rate": 3.999915253621739e-06, + "loss": 0.0013, + "step": 198 + }, + { + "epoch": 2.7327188940092166, + "grad_norm": 0.12693172693252563, + "learning_rate": 3.980922525988167e-06, + "loss": 0.0017, + "step": 199 + }, + { + "epoch": 2.7465437788018434, + "grad_norm": 0.11907753348350525, + "learning_rate": 3.961797195560118e-06, + "loss": 0.001, + 
"step": 200 + }, + { + "epoch": 2.76036866359447, + "grad_norm": 0.1901165395975113, + "learning_rate": 3.942540974832486e-06, + "loss": 0.0028, + "step": 201 + }, + { + "epoch": 2.774193548387097, + "grad_norm": 0.2039843052625656, + "learning_rate": 3.9231555880201655e-06, + "loss": 0.0011, + "step": 202 + }, + { + "epoch": 2.7880184331797233, + "grad_norm": 0.16181506216526031, + "learning_rate": 3.903642770903671e-06, + "loss": 0.003, + "step": 203 + }, + { + "epoch": 2.80184331797235, + "grad_norm": 0.13345211744308472, + "learning_rate": 3.884004270673711e-06, + "loss": 0.0023, + "step": 204 + }, + { + "epoch": 2.815668202764977, + "grad_norm": 0.19453725218772888, + "learning_rate": 3.864241845774746e-06, + "loss": 0.001, + "step": 205 + }, + { + "epoch": 2.8294930875576036, + "grad_norm": 0.18157535791397095, + "learning_rate": 3.844357265747531e-06, + "loss": 0.0029, + "step": 206 + }, + { + "epoch": 2.8433179723502304, + "grad_norm": 0.17876467108726501, + "learning_rate": 3.8243523110706736e-06, + "loss": 0.0018, + "step": 207 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 0.13000421226024628, + "learning_rate": 3.8042287730012117e-06, + "loss": 0.0011, + "step": 208 + }, + { + "epoch": 2.870967741935484, + "grad_norm": 0.08808371424674988, + "learning_rate": 3.7839884534142157e-06, + "loss": 0.0018, + "step": 209 + }, + { + "epoch": 2.8847926267281108, + "grad_norm": 0.32318148016929626, + "learning_rate": 3.7636331646414524e-06, + "loss": 0.0012, + "step": 210 + }, + { + "epoch": 2.8986175115207375, + "grad_norm": 0.1259954422712326, + "learning_rate": 3.7431647293091076e-06, + "loss": 0.0012, + "step": 211 + }, + { + "epoch": 2.912442396313364, + "grad_norm": 0.1344563215970993, + "learning_rate": 3.7225849801745835e-06, + "loss": 0.0006, + "step": 212 + }, + { + "epoch": 2.9262672811059907, + "grad_norm": 0.09105626493692398, + "learning_rate": 3.701895759962397e-06, + "loss": 0.0007, + "step": 213 + }, + { + "epoch": 2.9400921658986174, + 
"grad_norm": 0.11718853563070297, + "learning_rate": 3.6810989211991777e-06, + "loss": 0.0022, + "step": 214 + }, + { + "epoch": 2.953917050691244, + "grad_norm": 0.10988112539052963, + "learning_rate": 3.6601963260477923e-06, + "loss": 0.0007, + "step": 215 + }, + { + "epoch": 2.967741935483871, + "grad_norm": 0.12010538578033447, + "learning_rate": 3.6391898461406045e-06, + "loss": 0.0014, + "step": 216 + }, + { + "epoch": 2.9815668202764978, + "grad_norm": 0.12934529781341553, + "learning_rate": 3.6180813624118898e-06, + "loss": 0.001, + "step": 217 + }, + { + "epoch": 2.9953917050691246, + "grad_norm": 0.05664035677909851, + "learning_rate": 3.5968727649294134e-06, + "loss": 0.0002, + "step": 218 + }, + { + "epoch": 3.0, + "grad_norm": 0.07633747160434723, + "learning_rate": 3.575565952725193e-06, + "loss": 0.0002, + "step": 219 + }, + { + "epoch": 3.013824884792627, + "grad_norm": 0.16964735090732574, + "learning_rate": 3.55416283362546e-06, + "loss": 0.0005, + "step": 220 + }, + { + "epoch": 3.0276497695852536, + "grad_norm": 0.03826030716300011, + "learning_rate": 3.5326653240798283e-06, + "loss": 0.0003, + "step": 221 + }, + { + "epoch": 3.0414746543778803, + "grad_norm": 0.05900357663631439, + "learning_rate": 3.5110753489896924e-06, + "loss": 0.0006, + "step": 222 + }, + { + "epoch": 3.055299539170507, + "grad_norm": 0.06874338537454605, + "learning_rate": 3.4893948415358803e-06, + "loss": 0.0002, + "step": 223 + }, + { + "epoch": 3.0691244239631335, + "grad_norm": 0.10445930808782578, + "learning_rate": 3.4676257430055438e-06, + "loss": 0.0011, + "step": 224 + }, + { + "epoch": 3.0829493087557602, + "grad_norm": 0.03757224604487419, + "learning_rate": 3.4457700026183378e-06, + "loss": 0.0002, + "step": 225 + }, + { + "epoch": 3.096774193548387, + "grad_norm": 0.2678232491016388, + "learning_rate": 3.4238295773518924e-06, + "loss": 0.0012, + "step": 226 + }, + { + "epoch": 3.110599078341014, + "grad_norm": 0.11278262734413147, + "learning_rate": 
3.4018064317665745e-06, + "loss": 0.0003, + "step": 227 + }, + { + "epoch": 3.1244239631336406, + "grad_norm": 0.03823389112949371, + "learning_rate": 3.3797025378295826e-06, + "loss": 0.0002, + "step": 228 + }, + { + "epoch": 3.1382488479262673, + "grad_norm": 0.015309945680201054, + "learning_rate": 3.357519874738382e-06, + "loss": 0.0, + "step": 229 + }, + { + "epoch": 3.152073732718894, + "grad_norm": 0.12372211366891861, + "learning_rate": 3.3352604287434752e-06, + "loss": 0.0007, + "step": 230 + }, + { + "epoch": 3.165898617511521, + "grad_norm": 0.062292926013469696, + "learning_rate": 3.31292619297056e-06, + "loss": 0.0003, + "step": 231 + }, + { + "epoch": 3.1797235023041477, + "grad_norm": 0.02390543930232525, + "learning_rate": 3.29051916724206e-06, + "loss": 0.0001, + "step": 232 + }, + { + "epoch": 3.193548387096774, + "grad_norm": 0.035650208592414856, + "learning_rate": 3.2680413578980623e-06, + "loss": 0.0001, + "step": 233 + }, + { + "epoch": 3.207373271889401, + "grad_norm": 0.04304494708776474, + "learning_rate": 3.245494777616664e-06, + "loss": 0.0001, + "step": 234 + }, + { + "epoch": 3.2211981566820276, + "grad_norm": 0.07038014382123947, + "learning_rate": 3.2228814452337587e-06, + "loss": 0.0005, + "step": 235 + }, + { + "epoch": 3.2350230414746544, + "grad_norm": 0.18309231102466583, + "learning_rate": 3.2002033855622683e-06, + "loss": 0.0005, + "step": 236 + }, + { + "epoch": 3.248847926267281, + "grad_norm": 0.04949740692973137, + "learning_rate": 3.177462629210838e-06, + "loss": 0.0002, + "step": 237 + }, + { + "epoch": 3.262672811059908, + "grad_norm": 0.0319606214761734, + "learning_rate": 3.154661212402017e-06, + "loss": 0.0001, + "step": 238 + }, + { + "epoch": 3.2764976958525347, + "grad_norm": 0.062357429414987564, + "learning_rate": 3.131801176789934e-06, + "loss": 0.0003, + "step": 239 + }, + { + "epoch": 3.2903225806451615, + "grad_norm": 0.060603659600019455, + "learning_rate": 3.1088845692774798e-06, + "loss": 0.0003, + 
"step": 240 + }, + { + "epoch": 3.3041474654377883, + "grad_norm": 0.12379086762666702, + "learning_rate": 3.0859134418330373e-06, + "loss": 0.0013, + "step": 241 + }, + { + "epoch": 3.3179723502304146, + "grad_norm": 0.028559578582644463, + "learning_rate": 3.0628898513067357e-06, + "loss": 0.0002, + "step": 242 + }, + { + "epoch": 3.3317972350230414, + "grad_norm": 0.04983198642730713, + "learning_rate": 3.0398158592462847e-06, + "loss": 0.0001, + "step": 243 + }, + { + "epoch": 3.345622119815668, + "grad_norm": 0.07023701816797256, + "learning_rate": 3.0166935317123824e-06, + "loss": 0.0009, + "step": 244 + }, + { + "epoch": 3.359447004608295, + "grad_norm": 0.046779777854681015, + "learning_rate": 2.9935249390937184e-06, + "loss": 0.0002, + "step": 245 + }, + { + "epoch": 3.3732718894009217, + "grad_norm": 0.07187545299530029, + "learning_rate": 2.970312155921585e-06, + "loss": 0.0006, + "step": 246 + }, + { + "epoch": 3.3870967741935485, + "grad_norm": 0.019256649538874626, + "learning_rate": 2.9470572606841295e-06, + "loss": 0.0001, + "step": 247 + }, + { + "epoch": 3.4009216589861753, + "grad_norm": 0.0477205291390419, + "learning_rate": 2.9237623356402423e-06, + "loss": 0.0002, + "step": 248 + }, + { + "epoch": 3.4147465437788016, + "grad_norm": 0.06807561218738556, + "learning_rate": 2.900429466633107e-06, + "loss": 0.0003, + "step": 249 + }, + { + "epoch": 3.4285714285714284, + "grad_norm": 0.1516796499490738, + "learning_rate": 2.8770607429034352e-06, + "loss": 0.0006, + "step": 250 + }, + { + "epoch": 3.442396313364055, + "grad_norm": 0.045213282108306885, + "learning_rate": 2.8536582569023964e-06, + "loss": 0.0001, + "step": 251 + }, + { + "epoch": 3.456221198156682, + "grad_norm": 0.020110802724957466, + "learning_rate": 2.8302241041042564e-06, + "loss": 0.0001, + "step": 252 + }, + { + "epoch": 3.4700460829493087, + "grad_norm": 0.03242102265357971, + "learning_rate": 2.8067603828187446e-06, + "loss": 0.0002, + "step": 253 + }, + { + "epoch": 
3.4838709677419355, + "grad_norm": 0.039639636874198914, + "learning_rate": 2.7832691940031755e-06, + "loss": 0.0001, + "step": 254 + }, + { + "epoch": 3.4976958525345623, + "grad_norm": 0.06943561136722565, + "learning_rate": 2.759752641074322e-06, + "loss": 0.0003, + "step": 255 + }, + { + "epoch": 3.511520737327189, + "grad_norm": 0.02593497559428215, + "learning_rate": 2.7362128297200784e-06, + "loss": 0.0001, + "step": 256 + }, + { + "epoch": 3.525345622119816, + "grad_norm": 0.02811415307223797, + "learning_rate": 2.712651867710914e-06, + "loss": 0.0002, + "step": 257 + }, + { + "epoch": 3.539170506912442, + "grad_norm": 0.07381757348775864, + "learning_rate": 2.6890718647111424e-06, + "loss": 0.0003, + "step": 258 + }, + { + "epoch": 3.5529953917050694, + "grad_norm": 0.014391073025763035, + "learning_rate": 2.665474932090017e-06, + "loss": 0.0001, + "step": 259 + }, + { + "epoch": 3.5668202764976957, + "grad_norm": 0.027200503274798393, + "learning_rate": 2.6418631827326857e-06, + "loss": 0.0001, + "step": 260 + }, + { + "epoch": 3.5806451612903225, + "grad_norm": 0.02720312774181366, + "learning_rate": 2.6182387308509927e-06, + "loss": 0.0001, + "step": 261 + }, + { + "epoch": 3.5944700460829493, + "grad_norm": 0.04352420195937157, + "learning_rate": 2.5946036917941765e-06, + "loss": 0.0004, + "step": 262 + }, + { + "epoch": 3.608294930875576, + "grad_norm": 0.03459783270955086, + "learning_rate": 2.570960181859458e-06, + "loss": 0.0001, + "step": 263 + }, + { + "epoch": 3.622119815668203, + "grad_norm": 0.03097033128142357, + "learning_rate": 2.547310318102548e-06, + "loss": 0.0001, + "step": 264 + }, + { + "epoch": 3.6359447004608296, + "grad_norm": 0.0076720695942640305, + "learning_rate": 2.5236562181480794e-06, + "loss": 0.0, + "step": 265 + }, + { + "epoch": 3.6497695852534564, + "grad_norm": 0.023994900286197662, + "learning_rate": 2.5e-06, + "loss": 0.0001, + "step": 266 + }, + { + "epoch": 3.6635944700460827, + "grad_norm": 0.005682840943336487, + 
"learning_rate": 2.4763437818519205e-06, + "loss": 0.0, + "step": 267 + }, + { + "epoch": 3.6774193548387095, + "grad_norm": 0.030443254858255386, + "learning_rate": 2.4526896818974534e-06, + "loss": 0.0002, + "step": 268 + }, + { + "epoch": 3.6912442396313363, + "grad_norm": 0.008863283321261406, + "learning_rate": 2.429039818140543e-06, + "loss": 0.0, + "step": 269 + }, + { + "epoch": 3.705069124423963, + "grad_norm": 0.009775679558515549, + "learning_rate": 2.405396308205825e-06, + "loss": 0.0001, + "step": 270 + }, + { + "epoch": 3.71889400921659, + "grad_norm": 0.019227130338549614, + "learning_rate": 2.381761269149009e-06, + "loss": 0.0001, + "step": 271 + }, + { + "epoch": 3.7327188940092166, + "grad_norm": 0.037880923599004745, + "learning_rate": 2.358136817267315e-06, + "loss": 0.0002, + "step": 272 + }, + { + "epoch": 3.7465437788018434, + "grad_norm": 0.006014773156493902, + "learning_rate": 2.334525067909983e-06, + "loss": 0.0, + "step": 273 + }, + { + "epoch": 3.76036866359447, + "grad_norm": 0.024770596995949745, + "learning_rate": 2.3109281352888593e-06, + "loss": 0.0001, + "step": 274 + }, + { + "epoch": 3.774193548387097, + "grad_norm": 0.008392867632210255, + "learning_rate": 2.2873481322890866e-06, + "loss": 0.0, + "step": 275 + }, + { + "epoch": 3.7880184331797233, + "grad_norm": 0.030915284529328346, + "learning_rate": 2.263787170279922e-06, + "loss": 0.0002, + "step": 276 + }, + { + "epoch": 3.80184331797235, + "grad_norm": 0.04161324352025986, + "learning_rate": 2.2402473589256793e-06, + "loss": 0.0002, + "step": 277 + }, + { + "epoch": 3.815668202764977, + "grad_norm": 0.04104781523346901, + "learning_rate": 2.2167308059968258e-06, + "loss": 0.0001, + "step": 278 + }, + { + "epoch": 3.8294930875576036, + "grad_norm": 0.02981358952820301, + "learning_rate": 2.193239617181256e-06, + "loss": 0.0002, + "step": 279 + }, + { + "epoch": 3.8433179723502304, + "grad_norm": 0.03616194799542427, + "learning_rate": 2.169775895895745e-06, + "loss": 
0.0002, + "step": 280 + }, + { + "epoch": 3.857142857142857, + "grad_norm": 0.003307241713628173, + "learning_rate": 2.146341743097604e-06, + "loss": 0.0, + "step": 281 + }, + { + "epoch": 3.870967741935484, + "grad_norm": 0.023682212457060814, + "learning_rate": 2.1229392570965656e-06, + "loss": 0.0002, + "step": 282 + }, + { + "epoch": 3.8847926267281108, + "grad_norm": 0.08077914267778397, + "learning_rate": 2.0995705333668948e-06, + "loss": 0.0006, + "step": 283 + }, + { + "epoch": 3.8986175115207375, + "grad_norm": 0.012258109636604786, + "learning_rate": 2.0762376643597586e-06, + "loss": 0.0001, + "step": 284 + }, + { + "epoch": 3.912442396313364, + "grad_norm": 0.012420260347425938, + "learning_rate": 2.0529427393158704e-06, + "loss": 0.0001, + "step": 285 + }, + { + "epoch": 3.9262672811059907, + "grad_norm": 0.03773212060332298, + "learning_rate": 2.0296878440784164e-06, + "loss": 0.0001, + "step": 286 + }, + { + "epoch": 3.9400921658986174, + "grad_norm": 0.03834720700979233, + "learning_rate": 2.006475060906283e-06, + "loss": 0.0001, + "step": 287 + }, + { + "epoch": 3.953917050691244, + "grad_norm": 0.06677021831274033, + "learning_rate": 1.9833064682876175e-06, + "loss": 0.0001, + "step": 288 + } + ], + "logging_steps": 1, + "max_steps": 432, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 72, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.8398904142396391e+19, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-288/training_args.bin b/checkpoint-288/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..66c881ded23f97eecfbd08abb955a7188907de16 --- /dev/null +++ b/checkpoint-288/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 
+oid sha256:bccf4859d2ee74fd992386ba4a70a4c4fc6d0da061af69465c1db71ce0f24882 +size 7928 diff --git a/checkpoint-288/zero_to_fp32.py b/checkpoint-288/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/checkpoint-288/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in 
state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to 
the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel 
{unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + 
print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + 
partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the 
partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + 
state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. 
you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. 
Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/checkpoint-360/README.md b/checkpoint-360/README.md new file mode 100644 index 0000000000000000000000000000000000000000..037e1a543b9c1891b5c6981f89d5b7c7c9a907ae --- /dev/null +++ b/checkpoint-360/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Llama-3.1-70B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.0 \ No newline at end of file diff --git a/checkpoint-360/adapter_config.json b/checkpoint-360/adapter_config.json new file mode 100644 index 
0000000000000000000000000000000000000000..3295690c20dff5dc1d1f30f8500f0efb7e255838 --- /dev/null +++ b/checkpoint-360/adapter_config.json @@ -0,0 +1,42 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.1-70B-Instruct", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 512, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [ + "embed_tokens", + "lm_head" + ], + "peft_type": "LORA", + "r": 256, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "down_proj", + "gate_proj", + "v_proj", + "up_proj", + "k_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-360/adapter_model.safetensors b/checkpoint-360/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7c57d1ecf43247976a0d8d3e823c1df9db00b106 --- /dev/null +++ b/checkpoint-360/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45ecfa12954e33e517fcc7fc338e39c3dce005a52c53e90256374855782bf03f +size 10829849744 diff --git a/checkpoint-360/global_step357/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-360/global_step357/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..49cbe139ff8124dd5f89a51225aa05830c080b1a --- /dev/null +++ b/checkpoint-360/global_step357/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:8928e7717b367125df5a37b0891b5c1547a6c5da5c59babb59f135f5ee3176ec +size 21659418140 diff --git a/checkpoint-360/global_step357/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-360/global_step357/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..21f262e072e1013a70952d927cb59561ebaaf79b --- /dev/null +++ b/checkpoint-360/global_step357/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:245c03854626baff06b5e95480c4c01060f11432fb7a99a2a95960ebff0ebbb4 +size 21659457372 diff --git a/checkpoint-360/global_step357/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-360/global_step357/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c99c78ae4a78df5c16d549c8fd7040842d0d4dab --- /dev/null +++ b/checkpoint-360/global_step357/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:744b0c2698bbfc65353be2150d5432ef338ecc3e3be13e4dc2ddc7ec7b101839 +size 21659417820 diff --git a/checkpoint-360/global_step357/mp_rank_00_model_states.pt b/checkpoint-360/global_step357/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..ddd13b5e5f5659a840ed98cf8bf7413ce54b2d57 --- /dev/null +++ b/checkpoint-360/global_step357/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e7fb1bab88000d3293c9af31e0df314597b07f2e30ba999be234b47f755be73 +size 11918643933 diff --git a/checkpoint-360/latest b/checkpoint-360/latest new file mode 100644 index 0000000000000000000000000000000000000000..82b5f7ef15b841f6e2bb6d67b0148b2cd3277795 --- /dev/null +++ b/checkpoint-360/latest @@ -0,0 +1 @@ +global_step357 \ No newline at end of file diff --git a/checkpoint-360/rng_state_0.pth b/checkpoint-360/rng_state_0.pth new file mode 100644 index 
0000000000000000000000000000000000000000..0bc0ed10ed4931e058d17bfd0fb09d5722495759 --- /dev/null +++ b/checkpoint-360/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bff760ec5850731a18abe6b8c7e7f6a45c5dd541eaeb048d8066b987e042bcec +size 14768 diff --git a/checkpoint-360/rng_state_1.pth b/checkpoint-360/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..0ccb1c7c216b565574e135e61e9381c4b934bf31 --- /dev/null +++ b/checkpoint-360/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa125a965cd3501fecf1885f01cd88d194e523f7182e0fe1710a680e091c3d6f +size 14768 diff --git a/checkpoint-360/rng_state_2.pth b/checkpoint-360/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..54197787343cb5934778d904643b3f1ecb04e999 --- /dev/null +++ b/checkpoint-360/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e47d86fd08c700a35d1a6c5a138a7c3c26edb31ccc6a09cacdb3d44cc3d2640 +size 14768 diff --git a/checkpoint-360/scheduler.pt b/checkpoint-360/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..a9bea00971e15c82f3fef418b45f5772efc6e041 --- /dev/null +++ b/checkpoint-360/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f12fb5515df7dcf4cc1a3ecda848613d64db62cd311a9066dc53c01e4e1c6a83 +size 1064 diff --git a/checkpoint-360/special_tokens_map.json b/checkpoint-360/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..278b7f0f84be865c4687700ee7b3c63d89a51e18 --- /dev/null +++ b/checkpoint-360/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + 
"pad_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-360/tokenizer.json b/checkpoint-360/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-360/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-360/tokenizer_config.json b/checkpoint-360/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ca91a2ef55f4239a7af81d7c9abb05f53621a07b --- /dev/null +++ b/checkpoint-360/tokenizer_config.json @@ -0,0 +1,2064 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": 
"<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": 
"<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": 
"<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": 
"<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": 
"<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + 
"extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|end_of_text|>", + "tokenizer_class": "PreTrainedTokenizer" +} diff --git a/checkpoint-360/trainer_state.json b/checkpoint-360/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..62da6442ba2f3a495813baa5052ae249357f7e08 --- /dev/null +++ b/checkpoint-360/trainer_state.json @@ -0,0 +1,2553 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.940092165898617, + "eval_steps": 500, + "global_step": 360, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.013824884792626729, + "grad_norm": 31.00213623046875, + "learning_rate": 5.0000000000000004e-08, + "loss": 2.2089, + "step": 1 + }, + { + "epoch": 0.027649769585253458, + "grad_norm": 30.27136993408203, + "learning_rate": 1.0000000000000001e-07, + "loss": 2.1536, + "step": 2 + }, + { + "epoch": 0.041474654377880185, + "grad_norm": 30.48703384399414, + "learning_rate": 1.5000000000000002e-07, + "loss": 2.1581, + "step": 3 + }, + { + "epoch": 0.055299539170506916, + "grad_norm": 30.779329299926758, + "learning_rate": 2.0000000000000002e-07, + "loss": 2.1741, + "step": 4 + }, + { + "epoch": 0.06912442396313365, + "grad_norm": 31.22808837890625, + "learning_rate": 2.5000000000000004e-07, + "loss": 2.1864, + "step": 5 + }, + { + "epoch": 0.08294930875576037, + "grad_norm": 30.783327102661133, + "learning_rate": 3.0000000000000004e-07, + "loss": 2.1993, + "step": 6 + }, + { + "epoch": 0.0967741935483871, + "grad_norm": 30.57423210144043, + "learning_rate": 3.5000000000000004e-07, + "loss": 2.1506, + "step": 7 + }, + { + "epoch": 0.11059907834101383, + "grad_norm": 30.952186584472656, + "learning_rate": 4.0000000000000003e-07, + "loss": 2.1599, + "step": 8 + }, + { + "epoch": 0.12442396313364056, + "grad_norm": 30.37245750427246, + "learning_rate": 
4.5000000000000003e-07, + "loss": 2.1572, + "step": 9 + }, + { + "epoch": 0.1382488479262673, + "grad_norm": 30.930192947387695, + "learning_rate": 5.000000000000001e-07, + "loss": 2.1447, + "step": 10 + }, + { + "epoch": 0.15207373271889402, + "grad_norm": 29.735448837280273, + "learning_rate": 5.5e-07, + "loss": 2.0742, + "step": 11 + }, + { + "epoch": 0.16589861751152074, + "grad_norm": 29.62826156616211, + "learning_rate": 6.000000000000001e-07, + "loss": 2.061, + "step": 12 + }, + { + "epoch": 0.17972350230414746, + "grad_norm": 28.937463760375977, + "learning_rate": 6.5e-07, + "loss": 1.9974, + "step": 13 + }, + { + "epoch": 0.1935483870967742, + "grad_norm": 29.24833106994629, + "learning_rate": 7.000000000000001e-07, + "loss": 1.9833, + "step": 14 + }, + { + "epoch": 0.2073732718894009, + "grad_norm": 28.122018814086914, + "learning_rate": 7.5e-07, + "loss": 1.8934, + "step": 15 + }, + { + "epoch": 0.22119815668202766, + "grad_norm": 28.059659957885742, + "learning_rate": 8.000000000000001e-07, + "loss": 1.875, + "step": 16 + }, + { + "epoch": 0.2350230414746544, + "grad_norm": 27.361961364746094, + "learning_rate": 8.500000000000001e-07, + "loss": 1.8009, + "step": 17 + }, + { + "epoch": 0.2488479262672811, + "grad_norm": 26.721765518188477, + "learning_rate": 9.000000000000001e-07, + "loss": 1.7116, + "step": 18 + }, + { + "epoch": 0.2626728110599078, + "grad_norm": 25.37330436706543, + "learning_rate": 9.500000000000001e-07, + "loss": 1.5608, + "step": 19 + }, + { + "epoch": 0.2764976958525346, + "grad_norm": 25.81206703186035, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.5043, + "step": 20 + }, + { + "epoch": 0.2903225806451613, + "grad_norm": 25.539344787597656, + "learning_rate": 1.0500000000000001e-06, + "loss": 1.3673, + "step": 21 + }, + { + "epoch": 0.30414746543778803, + "grad_norm": 25.097164154052734, + "learning_rate": 1.1e-06, + "loss": 1.2029, + "step": 22 + }, + { + "epoch": 0.31797235023041476, + "grad_norm": 24.619497299194336, 
+ "learning_rate": 1.1500000000000002e-06, + "loss": 1.0458, + "step": 23 + }, + { + "epoch": 0.3317972350230415, + "grad_norm": 23.820302963256836, + "learning_rate": 1.2000000000000002e-06, + "loss": 0.8723, + "step": 24 + }, + { + "epoch": 0.3456221198156682, + "grad_norm": 23.12735939025879, + "learning_rate": 1.25e-06, + "loss": 0.7183, + "step": 25 + }, + { + "epoch": 0.35944700460829493, + "grad_norm": 20.127134323120117, + "learning_rate": 1.3e-06, + "loss": 0.5248, + "step": 26 + }, + { + "epoch": 0.37327188940092165, + "grad_norm": 15.901495933532715, + "learning_rate": 1.3500000000000002e-06, + "loss": 0.3689, + "step": 27 + }, + { + "epoch": 0.3870967741935484, + "grad_norm": 11.053832054138184, + "learning_rate": 1.4000000000000001e-06, + "loss": 0.2482, + "step": 28 + }, + { + "epoch": 0.4009216589861751, + "grad_norm": 7.248495578765869, + "learning_rate": 1.45e-06, + "loss": 0.1847, + "step": 29 + }, + { + "epoch": 0.4147465437788018, + "grad_norm": 5.378540515899658, + "learning_rate": 1.5e-06, + "loss": 0.1423, + "step": 30 + }, + { + "epoch": 0.42857142857142855, + "grad_norm": 3.8371808528900146, + "learning_rate": 1.5500000000000002e-06, + "loss": 0.1152, + "step": 31 + }, + { + "epoch": 0.4423963133640553, + "grad_norm": 2.2655274868011475, + "learning_rate": 1.6000000000000001e-06, + "loss": 0.0845, + "step": 32 + }, + { + "epoch": 0.45622119815668205, + "grad_norm": 1.5746861696243286, + "learning_rate": 1.6500000000000003e-06, + "loss": 0.0711, + "step": 33 + }, + { + "epoch": 0.4700460829493088, + "grad_norm": 1.3510947227478027, + "learning_rate": 1.7000000000000002e-06, + "loss": 0.0734, + "step": 34 + }, + { + "epoch": 0.4838709677419355, + "grad_norm": 0.9737389087677002, + "learning_rate": 1.75e-06, + "loss": 0.0651, + "step": 35 + }, + { + "epoch": 0.4976958525345622, + "grad_norm": 0.9815284609794617, + "learning_rate": 1.8000000000000001e-06, + "loss": 0.0593, + "step": 36 + }, + { + "epoch": 0.511520737327189, + "grad_norm": 
0.8567912578582764, + "learning_rate": 1.85e-06, + "loss": 0.0543, + "step": 37 + }, + { + "epoch": 0.5253456221198156, + "grad_norm": 0.6773302555084229, + "learning_rate": 1.9000000000000002e-06, + "loss": 0.0622, + "step": 38 + }, + { + "epoch": 0.5391705069124424, + "grad_norm": 0.49936285614967346, + "learning_rate": 1.9500000000000004e-06, + "loss": 0.0511, + "step": 39 + }, + { + "epoch": 0.5529953917050692, + "grad_norm": 0.6253588795661926, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.0478, + "step": 40 + }, + { + "epoch": 0.5668202764976958, + "grad_norm": 0.5103089809417725, + "learning_rate": 2.05e-06, + "loss": 0.0465, + "step": 41 + }, + { + "epoch": 0.5806451612903226, + "grad_norm": 0.29294702410697937, + "learning_rate": 2.1000000000000002e-06, + "loss": 0.0456, + "step": 42 + }, + { + "epoch": 0.5944700460829493, + "grad_norm": 0.4237954616546631, + "learning_rate": 2.15e-06, + "loss": 0.0501, + "step": 43 + }, + { + "epoch": 0.6082949308755761, + "grad_norm": 0.42243412137031555, + "learning_rate": 2.2e-06, + "loss": 0.0388, + "step": 44 + }, + { + "epoch": 0.6221198156682027, + "grad_norm": 0.37881818413734436, + "learning_rate": 2.25e-06, + "loss": 0.0415, + "step": 45 + }, + { + "epoch": 0.6359447004608295, + "grad_norm": 0.4941152036190033, + "learning_rate": 2.3000000000000004e-06, + "loss": 0.045, + "step": 46 + }, + { + "epoch": 0.6497695852534562, + "grad_norm": 0.3046450912952423, + "learning_rate": 2.35e-06, + "loss": 0.0386, + "step": 47 + }, + { + "epoch": 0.663594470046083, + "grad_norm": 0.39361852407455444, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.0447, + "step": 48 + }, + { + "epoch": 0.6774193548387096, + "grad_norm": 0.5190001130104065, + "learning_rate": 2.4500000000000003e-06, + "loss": 0.0364, + "step": 49 + }, + { + "epoch": 0.6912442396313364, + "grad_norm": 0.372072696685791, + "learning_rate": 2.5e-06, + "loss": 0.043, + "step": 50 + }, + { + "epoch": 0.7050691244239631, + "grad_norm": 
0.3756551146507263, + "learning_rate": 2.55e-06, + "loss": 0.0424, + "step": 51 + }, + { + "epoch": 0.7188940092165899, + "grad_norm": 0.4593554437160492, + "learning_rate": 2.6e-06, + "loss": 0.0387, + "step": 52 + }, + { + "epoch": 0.7327188940092166, + "grad_norm": 0.2931855618953705, + "learning_rate": 2.6500000000000005e-06, + "loss": 0.0396, + "step": 53 + }, + { + "epoch": 0.7465437788018433, + "grad_norm": 0.38429534435272217, + "learning_rate": 2.7000000000000004e-06, + "loss": 0.0373, + "step": 54 + }, + { + "epoch": 0.7603686635944701, + "grad_norm": 0.3506857752799988, + "learning_rate": 2.7500000000000004e-06, + "loss": 0.04, + "step": 55 + }, + { + "epoch": 0.7741935483870968, + "grad_norm": 0.29847028851509094, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.0369, + "step": 56 + }, + { + "epoch": 0.7880184331797235, + "grad_norm": 0.3653375506401062, + "learning_rate": 2.85e-06, + "loss": 0.0396, + "step": 57 + }, + { + "epoch": 0.8018433179723502, + "grad_norm": 0.3163083791732788, + "learning_rate": 2.9e-06, + "loss": 0.0337, + "step": 58 + }, + { + "epoch": 0.815668202764977, + "grad_norm": 0.3734363615512848, + "learning_rate": 2.95e-06, + "loss": 0.0327, + "step": 59 + }, + { + "epoch": 0.8294930875576036, + "grad_norm": 0.29547712206840515, + "learning_rate": 3e-06, + "loss": 0.0365, + "step": 60 + }, + { + "epoch": 0.8433179723502304, + "grad_norm": 0.4041007161140442, + "learning_rate": 3.05e-06, + "loss": 0.038, + "step": 61 + }, + { + "epoch": 0.8571428571428571, + "grad_norm": 0.3602149784564972, + "learning_rate": 3.1000000000000004e-06, + "loss": 0.033, + "step": 62 + }, + { + "epoch": 0.8709677419354839, + "grad_norm": 0.2948857545852661, + "learning_rate": 3.1500000000000003e-06, + "loss": 0.0386, + "step": 63 + }, + { + "epoch": 0.8847926267281107, + "grad_norm": 0.39098358154296875, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.0323, + "step": 64 + }, + { + "epoch": 0.8986175115207373, + "grad_norm": 
0.3692062795162201, + "learning_rate": 3.2500000000000002e-06, + "loss": 0.0309, + "step": 65 + }, + { + "epoch": 0.9124423963133641, + "grad_norm": 0.3967229425907135, + "learning_rate": 3.3000000000000006e-06, + "loss": 0.0346, + "step": 66 + }, + { + "epoch": 0.9262672811059908, + "grad_norm": 0.47776708006858826, + "learning_rate": 3.3500000000000005e-06, + "loss": 0.0355, + "step": 67 + }, + { + "epoch": 0.9400921658986175, + "grad_norm": 0.21545131504535675, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.0294, + "step": 68 + }, + { + "epoch": 0.9539170506912442, + "grad_norm": 0.23738539218902588, + "learning_rate": 3.45e-06, + "loss": 0.0308, + "step": 69 + }, + { + "epoch": 0.967741935483871, + "grad_norm": 0.29174014925956726, + "learning_rate": 3.5e-06, + "loss": 0.0312, + "step": 70 + }, + { + "epoch": 0.9815668202764977, + "grad_norm": 0.38475602865219116, + "learning_rate": 3.5500000000000003e-06, + "loss": 0.0324, + "step": 71 + }, + { + "epoch": 0.9953917050691244, + "grad_norm": 0.4077378809452057, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.0297, + "step": 72 + }, + { + "epoch": 1.0, + "grad_norm": 0.4077378809452057, + "learning_rate": 3.65e-06, + "loss": 0.031, + "step": 73 + }, + { + "epoch": 1.0138248847926268, + "grad_norm": 0.46581539511680603, + "learning_rate": 3.7e-06, + "loss": 0.0313, + "step": 74 + }, + { + "epoch": 1.0276497695852536, + "grad_norm": 0.24417200684547424, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.027, + "step": 75 + }, + { + "epoch": 1.0414746543778801, + "grad_norm": 0.20425117015838623, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.0307, + "step": 76 + }, + { + "epoch": 1.055299539170507, + "grad_norm": 0.3578161597251892, + "learning_rate": 3.85e-06, + "loss": 0.0312, + "step": 77 + }, + { + "epoch": 1.0691244239631337, + "grad_norm": 0.39486679434776306, + "learning_rate": 3.900000000000001e-06, + "loss": 0.0294, + "step": 78 + }, + { + "epoch": 1.0829493087557605, + 
"grad_norm": 0.3932795226573944, + "learning_rate": 3.95e-06, + "loss": 0.0307, + "step": 79 + }, + { + "epoch": 1.096774193548387, + "grad_norm": 0.2946235239505768, + "learning_rate": 4.000000000000001e-06, + "loss": 0.0257, + "step": 80 + }, + { + "epoch": 1.1105990783410138, + "grad_norm": 0.3318672776222229, + "learning_rate": 4.05e-06, + "loss": 0.0296, + "step": 81 + }, + { + "epoch": 1.1244239631336406, + "grad_norm": 0.23701588809490204, + "learning_rate": 4.1e-06, + "loss": 0.0298, + "step": 82 + }, + { + "epoch": 1.1382488479262673, + "grad_norm": 0.2415941059589386, + "learning_rate": 4.15e-06, + "loss": 0.0256, + "step": 83 + }, + { + "epoch": 1.1520737327188941, + "grad_norm": 0.24098087847232819, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.0263, + "step": 84 + }, + { + "epoch": 1.1658986175115207, + "grad_norm": 0.3530103862285614, + "learning_rate": 4.25e-06, + "loss": 0.0308, + "step": 85 + }, + { + "epoch": 1.1797235023041475, + "grad_norm": 0.2382838875055313, + "learning_rate": 4.3e-06, + "loss": 0.0254, + "step": 86 + }, + { + "epoch": 1.1935483870967742, + "grad_norm": 0.2670588791370392, + "learning_rate": 4.350000000000001e-06, + "loss": 0.0255, + "step": 87 + }, + { + "epoch": 1.2073732718894008, + "grad_norm": 0.30723804235458374, + "learning_rate": 4.4e-06, + "loss": 0.0263, + "step": 88 + }, + { + "epoch": 1.2211981566820276, + "grad_norm": 0.505890965461731, + "learning_rate": 4.450000000000001e-06, + "loss": 0.0265, + "step": 89 + }, + { + "epoch": 1.2350230414746544, + "grad_norm": 0.24307991564273834, + "learning_rate": 4.5e-06, + "loss": 0.0227, + "step": 90 + }, + { + "epoch": 1.2488479262672811, + "grad_norm": 0.2198561429977417, + "learning_rate": 4.5500000000000005e-06, + "loss": 0.0261, + "step": 91 + }, + { + "epoch": 1.262672811059908, + "grad_norm": 0.2435183823108673, + "learning_rate": 4.600000000000001e-06, + "loss": 0.0225, + "step": 92 + }, + { + "epoch": 1.2764976958525347, + "grad_norm": 
0.18837811052799225, + "learning_rate": 4.65e-06, + "loss": 0.0218, + "step": 93 + }, + { + "epoch": 1.2903225806451613, + "grad_norm": 0.3818771541118622, + "learning_rate": 4.7e-06, + "loss": 0.0223, + "step": 94 + }, + { + "epoch": 1.304147465437788, + "grad_norm": 0.2358720600605011, + "learning_rate": 4.75e-06, + "loss": 0.0204, + "step": 95 + }, + { + "epoch": 1.3179723502304148, + "grad_norm": 0.25374144315719604, + "learning_rate": 4.800000000000001e-06, + "loss": 0.022, + "step": 96 + }, + { + "epoch": 1.3317972350230414, + "grad_norm": 0.36181601881980896, + "learning_rate": 4.85e-06, + "loss": 0.0244, + "step": 97 + }, + { + "epoch": 1.3456221198156681, + "grad_norm": 0.3156590759754181, + "learning_rate": 4.9000000000000005e-06, + "loss": 0.0233, + "step": 98 + }, + { + "epoch": 1.359447004608295, + "grad_norm": 0.21958638727664948, + "learning_rate": 4.95e-06, + "loss": 0.0218, + "step": 99 + }, + { + "epoch": 1.3732718894009217, + "grad_norm": 0.34455621242523193, + "learning_rate": 5e-06, + "loss": 0.0267, + "step": 100 + }, + { + "epoch": 1.3870967741935485, + "grad_norm": 0.283086359500885, + "learning_rate": 4.999888074163108e-06, + "loss": 0.0238, + "step": 101 + }, + { + "epoch": 1.400921658986175, + "grad_norm": 0.28856486082077026, + "learning_rate": 4.999552306674345e-06, + "loss": 0.0186, + "step": 102 + }, + { + "epoch": 1.4147465437788018, + "grad_norm": 0.26721692085266113, + "learning_rate": 4.998992727598557e-06, + "loss": 0.0193, + "step": 103 + }, + { + "epoch": 1.4285714285714286, + "grad_norm": 0.3459971249103546, + "learning_rate": 4.998209387040829e-06, + "loss": 0.0218, + "step": 104 + }, + { + "epoch": 1.4423963133640554, + "grad_norm": 0.25979122519493103, + "learning_rate": 4.9972023551419995e-06, + "loss": 0.0216, + "step": 105 + }, + { + "epoch": 1.456221198156682, + "grad_norm": 0.19960424304008484, + "learning_rate": 4.995971722072379e-06, + "loss": 0.0176, + "step": 106 + }, + { + "epoch": 1.4700460829493087, + 
"grad_norm": 0.2529441714286804, + "learning_rate": 4.9945175980236745e-06, + "loss": 0.0181, + "step": 107 + }, + { + "epoch": 1.4838709677419355, + "grad_norm": 0.2690267264842987, + "learning_rate": 4.992840113199131e-06, + "loss": 0.0196, + "step": 108 + }, + { + "epoch": 1.4976958525345623, + "grad_norm": 0.3516470789909363, + "learning_rate": 4.990939417801859e-06, + "loss": 0.0182, + "step": 109 + }, + { + "epoch": 1.511520737327189, + "grad_norm": 0.30167508125305176, + "learning_rate": 4.988815682021398e-06, + "loss": 0.0205, + "step": 110 + }, + { + "epoch": 1.5253456221198156, + "grad_norm": 0.3920849859714508, + "learning_rate": 4.986469096018472e-06, + "loss": 0.0177, + "step": 111 + }, + { + "epoch": 1.5391705069124424, + "grad_norm": 0.3274078369140625, + "learning_rate": 4.983899869907963e-06, + "loss": 0.0185, + "step": 112 + }, + { + "epoch": 1.5529953917050692, + "grad_norm": 0.2237282395362854, + "learning_rate": 4.981108233740096e-06, + "loss": 0.016, + "step": 113 + }, + { + "epoch": 1.5668202764976957, + "grad_norm": 0.23966379463672638, + "learning_rate": 4.978094437479843e-06, + "loss": 0.0183, + "step": 114 + }, + { + "epoch": 1.5806451612903225, + "grad_norm": 0.4027673602104187, + "learning_rate": 4.97485875098454e-06, + "loss": 0.0171, + "step": 115 + }, + { + "epoch": 1.5944700460829493, + "grad_norm": 0.24082835018634796, + "learning_rate": 4.971401463979722e-06, + "loss": 0.016, + "step": 116 + }, + { + "epoch": 1.608294930875576, + "grad_norm": 0.19387558102607727, + "learning_rate": 4.967722886033181e-06, + "loss": 0.0165, + "step": 117 + }, + { + "epoch": 1.6221198156682028, + "grad_norm": 0.33696162700653076, + "learning_rate": 4.963823346527249e-06, + "loss": 0.0154, + "step": 118 + }, + { + "epoch": 1.6359447004608296, + "grad_norm": 0.30290740728378296, + "learning_rate": 4.959703194629304e-06, + "loss": 0.0175, + "step": 119 + }, + { + "epoch": 1.6497695852534562, + "grad_norm": 0.3781787157058716, + "learning_rate": 
4.955362799260507e-06, + "loss": 0.0145, + "step": 120 + }, + { + "epoch": 1.663594470046083, + "grad_norm": 0.39995357394218445, + "learning_rate": 4.950802549062764e-06, + "loss": 0.015, + "step": 121 + }, + { + "epoch": 1.6774193548387095, + "grad_norm": 0.19926570355892181, + "learning_rate": 4.946022852363932e-06, + "loss": 0.0135, + "step": 122 + }, + { + "epoch": 1.6912442396313363, + "grad_norm": 0.22450515627861023, + "learning_rate": 4.9410241371412525e-06, + "loss": 0.0135, + "step": 123 + }, + { + "epoch": 1.705069124423963, + "grad_norm": 0.3588384985923767, + "learning_rate": 4.935806850983034e-06, + "loss": 0.0125, + "step": 124 + }, + { + "epoch": 1.7188940092165899, + "grad_norm": 0.28571122884750366, + "learning_rate": 4.9303714610485705e-06, + "loss": 0.0166, + "step": 125 + }, + { + "epoch": 1.7327188940092166, + "grad_norm": 0.3496967852115631, + "learning_rate": 4.924718454026318e-06, + "loss": 0.0139, + "step": 126 + }, + { + "epoch": 1.7465437788018434, + "grad_norm": 0.3279854357242584, + "learning_rate": 4.918848336090309e-06, + "loss": 0.0133, + "step": 127 + }, + { + "epoch": 1.7603686635944702, + "grad_norm": 0.19201801717281342, + "learning_rate": 4.912761632854834e-06, + "loss": 0.0151, + "step": 128 + }, + { + "epoch": 1.7741935483870968, + "grad_norm": 0.27701929211616516, + "learning_rate": 4.906458889327375e-06, + "loss": 0.0148, + "step": 129 + }, + { + "epoch": 1.7880184331797235, + "grad_norm": 0.2757968008518219, + "learning_rate": 4.899940669859807e-06, + "loss": 0.0118, + "step": 130 + }, + { + "epoch": 1.80184331797235, + "grad_norm": 0.18373191356658936, + "learning_rate": 4.893207558097867e-06, + "loss": 0.0149, + "step": 131 + }, + { + "epoch": 1.8156682027649769, + "grad_norm": 0.2116280496120453, + "learning_rate": 4.8862601569288885e-06, + "loss": 0.0129, + "step": 132 + }, + { + "epoch": 1.8294930875576036, + "grad_norm": 0.30384117364883423, + "learning_rate": 4.879099088427824e-06, + "loss": 0.0136, + "step": 133 + 
}, + { + "epoch": 1.8433179723502304, + "grad_norm": 0.3766787052154541, + "learning_rate": 4.871724993801541e-06, + "loss": 0.0123, + "step": 134 + }, + { + "epoch": 1.8571428571428572, + "grad_norm": 0.3401263356208801, + "learning_rate": 4.864138533331411e-06, + "loss": 0.0122, + "step": 135 + }, + { + "epoch": 1.870967741935484, + "grad_norm": 0.24321958422660828, + "learning_rate": 4.8563403863141825e-06, + "loss": 0.0123, + "step": 136 + }, + { + "epoch": 1.8847926267281108, + "grad_norm": 0.16918110847473145, + "learning_rate": 4.84833125100116e-06, + "loss": 0.0104, + "step": 137 + }, + { + "epoch": 1.8986175115207373, + "grad_norm": 0.23489230871200562, + "learning_rate": 4.840111844535682e-06, + "loss": 0.0122, + "step": 138 + }, + { + "epoch": 1.912442396313364, + "grad_norm": 0.32796236872673035, + "learning_rate": 4.8316829028889076e-06, + "loss": 0.0109, + "step": 139 + }, + { + "epoch": 1.9262672811059907, + "grad_norm": 0.24210475385189056, + "learning_rate": 4.823045180793914e-06, + "loss": 0.0118, + "step": 140 + }, + { + "epoch": 1.9400921658986174, + "grad_norm": 0.3450548052787781, + "learning_rate": 4.8141994516781196e-06, + "loss": 0.0115, + "step": 141 + }, + { + "epoch": 1.9539170506912442, + "grad_norm": 0.23163923621177673, + "learning_rate": 4.805146507594034e-06, + "loss": 0.0122, + "step": 142 + }, + { + "epoch": 1.967741935483871, + "grad_norm": 0.8197745084762573, + "learning_rate": 4.7958871591483305e-06, + "loss": 0.0101, + "step": 143 + }, + { + "epoch": 1.9815668202764978, + "grad_norm": 0.2917576730251312, + "learning_rate": 4.786422235429269e-06, + "loss": 0.0078, + "step": 144 + }, + { + "epoch": 1.9953917050691246, + "grad_norm": 0.24417108297348022, + "learning_rate": 4.776752583932455e-06, + "loss": 0.0119, + "step": 145 + }, + { + "epoch": 2.0, + "grad_norm": 0.24417108297348022, + "learning_rate": 4.766879070484957e-06, + "loss": 0.0089, + "step": 146 + }, + { + "epoch": 2.013824884792627, + "grad_norm": 
0.4215025305747986, + "learning_rate": 4.756802579167781e-06, + "loss": 0.007, + "step": 147 + }, + { + "epoch": 2.0276497695852536, + "grad_norm": 0.2002098709344864, + "learning_rate": 4.746524012236706e-06, + "loss": 0.0078, + "step": 148 + }, + { + "epoch": 2.0414746543778803, + "grad_norm": 0.16432569921016693, + "learning_rate": 4.736044290041496e-06, + "loss": 0.0074, + "step": 149 + }, + { + "epoch": 2.055299539170507, + "grad_norm": 0.2516174018383026, + "learning_rate": 4.725364350943492e-06, + "loss": 0.0067, + "step": 150 + }, + { + "epoch": 2.0691244239631335, + "grad_norm": 0.24242427945137024, + "learning_rate": 4.714485151231593e-06, + "loss": 0.0083, + "step": 151 + }, + { + "epoch": 2.0829493087557602, + "grad_norm": 0.22929197549819946, + "learning_rate": 4.703407665036622e-06, + "loss": 0.0061, + "step": 152 + }, + { + "epoch": 2.096774193548387, + "grad_norm": 0.2929408550262451, + "learning_rate": 4.692132884244113e-06, + "loss": 0.0064, + "step": 153 + }, + { + "epoch": 2.110599078341014, + "grad_norm": 0.22497303783893585, + "learning_rate": 4.680661818405485e-06, + "loss": 0.0061, + "step": 154 + }, + { + "epoch": 2.1244239631336406, + "grad_norm": 0.13698536157608032, + "learning_rate": 4.668995494647653e-06, + "loss": 0.0059, + "step": 155 + }, + { + "epoch": 2.1382488479262673, + "grad_norm": 0.32037150859832764, + "learning_rate": 4.657134957581057e-06, + "loss": 0.0067, + "step": 156 + }, + { + "epoch": 2.152073732718894, + "grad_norm": 0.19389067590236664, + "learning_rate": 4.645081269206128e-06, + "loss": 0.0062, + "step": 157 + }, + { + "epoch": 2.165898617511521, + "grad_norm": 0.2791127562522888, + "learning_rate": 4.632835508818192e-06, + "loss": 0.0058, + "step": 158 + }, + { + "epoch": 2.1797235023041477, + "grad_norm": 0.2178739458322525, + "learning_rate": 4.620398772910833e-06, + "loss": 0.0056, + "step": 159 + }, + { + "epoch": 2.193548387096774, + "grad_norm": 0.29685622453689575, + "learning_rate": 4.607772175077712e-06, 
+ "loss": 0.0055, + "step": 160 + }, + { + "epoch": 2.207373271889401, + "grad_norm": 0.6792906522750854, + "learning_rate": 4.59495684591285e-06, + "loss": 0.0057, + "step": 161 + }, + { + "epoch": 2.2211981566820276, + "grad_norm": 0.17910148203372955, + "learning_rate": 4.581953932909403e-06, + "loss": 0.0046, + "step": 162 + }, + { + "epoch": 2.2350230414746544, + "grad_norm": 0.12593543529510498, + "learning_rate": 4.5687646003569055e-06, + "loss": 0.0046, + "step": 163 + }, + { + "epoch": 2.248847926267281, + "grad_norm": 0.15383680164813995, + "learning_rate": 4.555390029237026e-06, + "loss": 0.0059, + "step": 164 + }, + { + "epoch": 2.262672811059908, + "grad_norm": 0.2324540764093399, + "learning_rate": 4.541831417117815e-06, + "loss": 0.0067, + "step": 165 + }, + { + "epoch": 2.2764976958525347, + "grad_norm": 0.21278905868530273, + "learning_rate": 4.528089978046481e-06, + "loss": 0.0054, + "step": 166 + }, + { + "epoch": 2.2903225806451615, + "grad_norm": 0.2499057948589325, + "learning_rate": 4.514166942440679e-06, + "loss": 0.003, + "step": 167 + }, + { + "epoch": 2.3041474654377883, + "grad_norm": 0.1734611839056015, + "learning_rate": 4.5000635569783365e-06, + "loss": 0.0043, + "step": 168 + }, + { + "epoch": 2.3179723502304146, + "grad_norm": 0.17815802991390228, + "learning_rate": 4.4857810844860325e-06, + "loss": 0.0048, + "step": 169 + }, + { + "epoch": 2.3317972350230414, + "grad_norm": 0.22731409966945648, + "learning_rate": 4.471320803825915e-06, + "loss": 0.0034, + "step": 170 + }, + { + "epoch": 2.345622119815668, + "grad_norm": 0.23811140656471252, + "learning_rate": 4.4566840097811956e-06, + "loss": 0.0029, + "step": 171 + }, + { + "epoch": 2.359447004608295, + "grad_norm": 0.17744024097919464, + "learning_rate": 4.4418720129402145e-06, + "loss": 0.0029, + "step": 172 + }, + { + "epoch": 2.3732718894009217, + "grad_norm": 0.24912229180335999, + "learning_rate": 4.426886139579083e-06, + "loss": 0.0049, + "step": 173 + }, + { + "epoch": 
2.3870967741935485, + "grad_norm": 0.17039696872234344, + "learning_rate": 4.411727731542937e-06, + "loss": 0.003, + "step": 174 + }, + { + "epoch": 2.4009216589861753, + "grad_norm": 0.3089725375175476, + "learning_rate": 4.39639814612578e-06, + "loss": 0.0034, + "step": 175 + }, + { + "epoch": 2.4147465437788016, + "grad_norm": 0.22647598385810852, + "learning_rate": 4.3808987559489536e-06, + "loss": 0.0058, + "step": 176 + }, + { + "epoch": 2.4285714285714284, + "grad_norm": 0.19015835225582123, + "learning_rate": 4.365230948838232e-06, + "loss": 0.004, + "step": 177 + }, + { + "epoch": 2.442396313364055, + "grad_norm": 0.1825973391532898, + "learning_rate": 4.349396127699552e-06, + "loss": 0.0032, + "step": 178 + }, + { + "epoch": 2.456221198156682, + "grad_norm": 0.15705449879169464, + "learning_rate": 4.3333957103934025e-06, + "loss": 0.0035, + "step": 179 + }, + { + "epoch": 2.4700460829493087, + "grad_norm": 0.19110225141048431, + "learning_rate": 4.317231129607859e-06, + "loss": 0.0019, + "step": 180 + }, + { + "epoch": 2.4838709677419355, + "grad_norm": 0.1481270045042038, + "learning_rate": 4.30090383273031e-06, + "loss": 0.0035, + "step": 181 + }, + { + "epoch": 2.4976958525345623, + "grad_norm": 0.19533571600914001, + "learning_rate": 4.2844152817178476e-06, + "loss": 0.0023, + "step": 182 + }, + { + "epoch": 2.511520737327189, + "grad_norm": 0.1991293579339981, + "learning_rate": 4.267766952966369e-06, + "loss": 0.0025, + "step": 183 + }, + { + "epoch": 2.525345622119816, + "grad_norm": 0.22637878358364105, + "learning_rate": 4.2509603371783776e-06, + "loss": 0.0029, + "step": 184 + }, + { + "epoch": 2.539170506912442, + "grad_norm": 0.21984712779521942, + "learning_rate": 4.233996939229502e-06, + "loss": 0.0035, + "step": 185 + }, + { + "epoch": 2.5529953917050694, + "grad_norm": 0.25706061720848083, + "learning_rate": 4.216878278033753e-06, + "loss": 0.0033, + "step": 186 + }, + { + "epoch": 2.5668202764976957, + "grad_norm": 0.224118173122406, + 
"learning_rate": 4.199605886407515e-06, + "loss": 0.0017, + "step": 187 + }, + { + "epoch": 2.5806451612903225, + "grad_norm": 0.0781751424074173, + "learning_rate": 4.1821813109322975e-06, + "loss": 0.002, + "step": 188 + }, + { + "epoch": 2.5944700460829493, + "grad_norm": 0.2209765911102295, + "learning_rate": 4.164606111816256e-06, + "loss": 0.0018, + "step": 189 + }, + { + "epoch": 2.608294930875576, + "grad_norm": 0.12815824151039124, + "learning_rate": 4.146881862754485e-06, + "loss": 0.003, + "step": 190 + }, + { + "epoch": 2.622119815668203, + "grad_norm": 0.3006991147994995, + "learning_rate": 4.129010150788112e-06, + "loss": 0.0022, + "step": 191 + }, + { + "epoch": 2.6359447004608296, + "grad_norm": 0.19085584580898285, + "learning_rate": 4.110992576162193e-06, + "loss": 0.0026, + "step": 192 + }, + { + "epoch": 2.6497695852534564, + "grad_norm": 0.13027659058570862, + "learning_rate": 4.092830752182423e-06, + "loss": 0.0015, + "step": 193 + }, + { + "epoch": 2.6635944700460827, + "grad_norm": 0.16998590528964996, + "learning_rate": 4.074526305070679e-06, + "loss": 0.0018, + "step": 194 + }, + { + "epoch": 2.6774193548387095, + "grad_norm": 0.1743537187576294, + "learning_rate": 4.056080873819412e-06, + "loss": 0.0022, + "step": 195 + }, + { + "epoch": 2.6912442396313363, + "grad_norm": 0.3566405177116394, + "learning_rate": 4.037496110044885e-06, + "loss": 0.0018, + "step": 196 + }, + { + "epoch": 2.705069124423963, + "grad_norm": 0.274739146232605, + "learning_rate": 4.018773677839289e-06, + "loss": 0.0012, + "step": 197 + }, + { + "epoch": 2.71889400921659, + "grad_norm": 0.12038746476173401, + "learning_rate": 3.999915253621739e-06, + "loss": 0.0013, + "step": 198 + }, + { + "epoch": 2.7327188940092166, + "grad_norm": 0.12693172693252563, + "learning_rate": 3.980922525988167e-06, + "loss": 0.0017, + "step": 199 + }, + { + "epoch": 2.7465437788018434, + "grad_norm": 0.11907753348350525, + "learning_rate": 3.961797195560118e-06, + "loss": 0.001, + 
"step": 200 + }, + { + "epoch": 2.76036866359447, + "grad_norm": 0.1901165395975113, + "learning_rate": 3.942540974832486e-06, + "loss": 0.0028, + "step": 201 + }, + { + "epoch": 2.774193548387097, + "grad_norm": 0.2039843052625656, + "learning_rate": 3.9231555880201655e-06, + "loss": 0.0011, + "step": 202 + }, + { + "epoch": 2.7880184331797233, + "grad_norm": 0.16181506216526031, + "learning_rate": 3.903642770903671e-06, + "loss": 0.003, + "step": 203 + }, + { + "epoch": 2.80184331797235, + "grad_norm": 0.13345211744308472, + "learning_rate": 3.884004270673711e-06, + "loss": 0.0023, + "step": 204 + }, + { + "epoch": 2.815668202764977, + "grad_norm": 0.19453725218772888, + "learning_rate": 3.864241845774746e-06, + "loss": 0.001, + "step": 205 + }, + { + "epoch": 2.8294930875576036, + "grad_norm": 0.18157535791397095, + "learning_rate": 3.844357265747531e-06, + "loss": 0.0029, + "step": 206 + }, + { + "epoch": 2.8433179723502304, + "grad_norm": 0.17876467108726501, + "learning_rate": 3.8243523110706736e-06, + "loss": 0.0018, + "step": 207 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 0.13000421226024628, + "learning_rate": 3.8042287730012117e-06, + "loss": 0.0011, + "step": 208 + }, + { + "epoch": 2.870967741935484, + "grad_norm": 0.08808371424674988, + "learning_rate": 3.7839884534142157e-06, + "loss": 0.0018, + "step": 209 + }, + { + "epoch": 2.8847926267281108, + "grad_norm": 0.32318148016929626, + "learning_rate": 3.7636331646414524e-06, + "loss": 0.0012, + "step": 210 + }, + { + "epoch": 2.8986175115207375, + "grad_norm": 0.1259954422712326, + "learning_rate": 3.7431647293091076e-06, + "loss": 0.0012, + "step": 211 + }, + { + "epoch": 2.912442396313364, + "grad_norm": 0.1344563215970993, + "learning_rate": 3.7225849801745835e-06, + "loss": 0.0006, + "step": 212 + }, + { + "epoch": 2.9262672811059907, + "grad_norm": 0.09105626493692398, + "learning_rate": 3.701895759962397e-06, + "loss": 0.0007, + "step": 213 + }, + { + "epoch": 2.9400921658986174, + 
"grad_norm": 0.11718853563070297, + "learning_rate": 3.6810989211991777e-06, + "loss": 0.0022, + "step": 214 + }, + { + "epoch": 2.953917050691244, + "grad_norm": 0.10988112539052963, + "learning_rate": 3.6601963260477923e-06, + "loss": 0.0007, + "step": 215 + }, + { + "epoch": 2.967741935483871, + "grad_norm": 0.12010538578033447, + "learning_rate": 3.6391898461406045e-06, + "loss": 0.0014, + "step": 216 + }, + { + "epoch": 2.9815668202764978, + "grad_norm": 0.12934529781341553, + "learning_rate": 3.6180813624118898e-06, + "loss": 0.001, + "step": 217 + }, + { + "epoch": 2.9953917050691246, + "grad_norm": 0.05664035677909851, + "learning_rate": 3.5968727649294134e-06, + "loss": 0.0002, + "step": 218 + }, + { + "epoch": 3.0, + "grad_norm": 0.07633747160434723, + "learning_rate": 3.575565952725193e-06, + "loss": 0.0002, + "step": 219 + }, + { + "epoch": 3.013824884792627, + "grad_norm": 0.16964735090732574, + "learning_rate": 3.55416283362546e-06, + "loss": 0.0005, + "step": 220 + }, + { + "epoch": 3.0276497695852536, + "grad_norm": 0.03826030716300011, + "learning_rate": 3.5326653240798283e-06, + "loss": 0.0003, + "step": 221 + }, + { + "epoch": 3.0414746543778803, + "grad_norm": 0.05900357663631439, + "learning_rate": 3.5110753489896924e-06, + "loss": 0.0006, + "step": 222 + }, + { + "epoch": 3.055299539170507, + "grad_norm": 0.06874338537454605, + "learning_rate": 3.4893948415358803e-06, + "loss": 0.0002, + "step": 223 + }, + { + "epoch": 3.0691244239631335, + "grad_norm": 0.10445930808782578, + "learning_rate": 3.4676257430055438e-06, + "loss": 0.0011, + "step": 224 + }, + { + "epoch": 3.0829493087557602, + "grad_norm": 0.03757224604487419, + "learning_rate": 3.4457700026183378e-06, + "loss": 0.0002, + "step": 225 + }, + { + "epoch": 3.096774193548387, + "grad_norm": 0.2678232491016388, + "learning_rate": 3.4238295773518924e-06, + "loss": 0.0012, + "step": 226 + }, + { + "epoch": 3.110599078341014, + "grad_norm": 0.11278262734413147, + "learning_rate": 
3.4018064317665745e-06, + "loss": 0.0003, + "step": 227 + }, + { + "epoch": 3.1244239631336406, + "grad_norm": 0.03823389112949371, + "learning_rate": 3.3797025378295826e-06, + "loss": 0.0002, + "step": 228 + }, + { + "epoch": 3.1382488479262673, + "grad_norm": 0.015309945680201054, + "learning_rate": 3.357519874738382e-06, + "loss": 0.0, + "step": 229 + }, + { + "epoch": 3.152073732718894, + "grad_norm": 0.12372211366891861, + "learning_rate": 3.3352604287434752e-06, + "loss": 0.0007, + "step": 230 + }, + { + "epoch": 3.165898617511521, + "grad_norm": 0.062292926013469696, + "learning_rate": 3.31292619297056e-06, + "loss": 0.0003, + "step": 231 + }, + { + "epoch": 3.1797235023041477, + "grad_norm": 0.02390543930232525, + "learning_rate": 3.29051916724206e-06, + "loss": 0.0001, + "step": 232 + }, + { + "epoch": 3.193548387096774, + "grad_norm": 0.035650208592414856, + "learning_rate": 3.2680413578980623e-06, + "loss": 0.0001, + "step": 233 + }, + { + "epoch": 3.207373271889401, + "grad_norm": 0.04304494708776474, + "learning_rate": 3.245494777616664e-06, + "loss": 0.0001, + "step": 234 + }, + { + "epoch": 3.2211981566820276, + "grad_norm": 0.07038014382123947, + "learning_rate": 3.2228814452337587e-06, + "loss": 0.0005, + "step": 235 + }, + { + "epoch": 3.2350230414746544, + "grad_norm": 0.18309231102466583, + "learning_rate": 3.2002033855622683e-06, + "loss": 0.0005, + "step": 236 + }, + { + "epoch": 3.248847926267281, + "grad_norm": 0.04949740692973137, + "learning_rate": 3.177462629210838e-06, + "loss": 0.0002, + "step": 237 + }, + { + "epoch": 3.262672811059908, + "grad_norm": 0.0319606214761734, + "learning_rate": 3.154661212402017e-06, + "loss": 0.0001, + "step": 238 + }, + { + "epoch": 3.2764976958525347, + "grad_norm": 0.062357429414987564, + "learning_rate": 3.131801176789934e-06, + "loss": 0.0003, + "step": 239 + }, + { + "epoch": 3.2903225806451615, + "grad_norm": 0.060603659600019455, + "learning_rate": 3.1088845692774798e-06, + "loss": 0.0003, + 
"step": 240 + }, + { + "epoch": 3.3041474654377883, + "grad_norm": 0.12379086762666702, + "learning_rate": 3.0859134418330373e-06, + "loss": 0.0013, + "step": 241 + }, + { + "epoch": 3.3179723502304146, + "grad_norm": 0.028559578582644463, + "learning_rate": 3.0628898513067357e-06, + "loss": 0.0002, + "step": 242 + }, + { + "epoch": 3.3317972350230414, + "grad_norm": 0.04983198642730713, + "learning_rate": 3.0398158592462847e-06, + "loss": 0.0001, + "step": 243 + }, + { + "epoch": 3.345622119815668, + "grad_norm": 0.07023701816797256, + "learning_rate": 3.0166935317123824e-06, + "loss": 0.0009, + "step": 244 + }, + { + "epoch": 3.359447004608295, + "grad_norm": 0.046779777854681015, + "learning_rate": 2.9935249390937184e-06, + "loss": 0.0002, + "step": 245 + }, + { + "epoch": 3.3732718894009217, + "grad_norm": 0.07187545299530029, + "learning_rate": 2.970312155921585e-06, + "loss": 0.0006, + "step": 246 + }, + { + "epoch": 3.3870967741935485, + "grad_norm": 0.019256649538874626, + "learning_rate": 2.9470572606841295e-06, + "loss": 0.0001, + "step": 247 + }, + { + "epoch": 3.4009216589861753, + "grad_norm": 0.0477205291390419, + "learning_rate": 2.9237623356402423e-06, + "loss": 0.0002, + "step": 248 + }, + { + "epoch": 3.4147465437788016, + "grad_norm": 0.06807561218738556, + "learning_rate": 2.900429466633107e-06, + "loss": 0.0003, + "step": 249 + }, + { + "epoch": 3.4285714285714284, + "grad_norm": 0.1516796499490738, + "learning_rate": 2.8770607429034352e-06, + "loss": 0.0006, + "step": 250 + }, + { + "epoch": 3.442396313364055, + "grad_norm": 0.045213282108306885, + "learning_rate": 2.8536582569023964e-06, + "loss": 0.0001, + "step": 251 + }, + { + "epoch": 3.456221198156682, + "grad_norm": 0.020110802724957466, + "learning_rate": 2.8302241041042564e-06, + "loss": 0.0001, + "step": 252 + }, + { + "epoch": 3.4700460829493087, + "grad_norm": 0.03242102265357971, + "learning_rate": 2.8067603828187446e-06, + "loss": 0.0002, + "step": 253 + }, + { + "epoch": 
3.4838709677419355, + "grad_norm": 0.039639636874198914, + "learning_rate": 2.7832691940031755e-06, + "loss": 0.0001, + "step": 254 + }, + { + "epoch": 3.4976958525345623, + "grad_norm": 0.06943561136722565, + "learning_rate": 2.759752641074322e-06, + "loss": 0.0003, + "step": 255 + }, + { + "epoch": 3.511520737327189, + "grad_norm": 0.02593497559428215, + "learning_rate": 2.7362128297200784e-06, + "loss": 0.0001, + "step": 256 + }, + { + "epoch": 3.525345622119816, + "grad_norm": 0.02811415307223797, + "learning_rate": 2.712651867710914e-06, + "loss": 0.0002, + "step": 257 + }, + { + "epoch": 3.539170506912442, + "grad_norm": 0.07381757348775864, + "learning_rate": 2.6890718647111424e-06, + "loss": 0.0003, + "step": 258 + }, + { + "epoch": 3.5529953917050694, + "grad_norm": 0.014391073025763035, + "learning_rate": 2.665474932090017e-06, + "loss": 0.0001, + "step": 259 + }, + { + "epoch": 3.5668202764976957, + "grad_norm": 0.027200503274798393, + "learning_rate": 2.6418631827326857e-06, + "loss": 0.0001, + "step": 260 + }, + { + "epoch": 3.5806451612903225, + "grad_norm": 0.02720312774181366, + "learning_rate": 2.6182387308509927e-06, + "loss": 0.0001, + "step": 261 + }, + { + "epoch": 3.5944700460829493, + "grad_norm": 0.04352420195937157, + "learning_rate": 2.5946036917941765e-06, + "loss": 0.0004, + "step": 262 + }, + { + "epoch": 3.608294930875576, + "grad_norm": 0.03459783270955086, + "learning_rate": 2.570960181859458e-06, + "loss": 0.0001, + "step": 263 + }, + { + "epoch": 3.622119815668203, + "grad_norm": 0.03097033128142357, + "learning_rate": 2.547310318102548e-06, + "loss": 0.0001, + "step": 264 + }, + { + "epoch": 3.6359447004608296, + "grad_norm": 0.0076720695942640305, + "learning_rate": 2.5236562181480794e-06, + "loss": 0.0, + "step": 265 + }, + { + "epoch": 3.6497695852534564, + "grad_norm": 0.023994900286197662, + "learning_rate": 2.5e-06, + "loss": 0.0001, + "step": 266 + }, + { + "epoch": 3.6635944700460827, + "grad_norm": 0.005682840943336487, + 
"learning_rate": 2.4763437818519205e-06, + "loss": 0.0, + "step": 267 + }, + { + "epoch": 3.6774193548387095, + "grad_norm": 0.030443254858255386, + "learning_rate": 2.4526896818974534e-06, + "loss": 0.0002, + "step": 268 + }, + { + "epoch": 3.6912442396313363, + "grad_norm": 0.008863283321261406, + "learning_rate": 2.429039818140543e-06, + "loss": 0.0, + "step": 269 + }, + { + "epoch": 3.705069124423963, + "grad_norm": 0.009775679558515549, + "learning_rate": 2.405396308205825e-06, + "loss": 0.0001, + "step": 270 + }, + { + "epoch": 3.71889400921659, + "grad_norm": 0.019227130338549614, + "learning_rate": 2.381761269149009e-06, + "loss": 0.0001, + "step": 271 + }, + { + "epoch": 3.7327188940092166, + "grad_norm": 0.037880923599004745, + "learning_rate": 2.358136817267315e-06, + "loss": 0.0002, + "step": 272 + }, + { + "epoch": 3.7465437788018434, + "grad_norm": 0.006014773156493902, + "learning_rate": 2.334525067909983e-06, + "loss": 0.0, + "step": 273 + }, + { + "epoch": 3.76036866359447, + "grad_norm": 0.024770596995949745, + "learning_rate": 2.3109281352888593e-06, + "loss": 0.0001, + "step": 274 + }, + { + "epoch": 3.774193548387097, + "grad_norm": 0.008392867632210255, + "learning_rate": 2.2873481322890866e-06, + "loss": 0.0, + "step": 275 + }, + { + "epoch": 3.7880184331797233, + "grad_norm": 0.030915284529328346, + "learning_rate": 2.263787170279922e-06, + "loss": 0.0002, + "step": 276 + }, + { + "epoch": 3.80184331797235, + "grad_norm": 0.04161324352025986, + "learning_rate": 2.2402473589256793e-06, + "loss": 0.0002, + "step": 277 + }, + { + "epoch": 3.815668202764977, + "grad_norm": 0.04104781523346901, + "learning_rate": 2.2167308059968258e-06, + "loss": 0.0001, + "step": 278 + }, + { + "epoch": 3.8294930875576036, + "grad_norm": 0.02981358952820301, + "learning_rate": 2.193239617181256e-06, + "loss": 0.0002, + "step": 279 + }, + { + "epoch": 3.8433179723502304, + "grad_norm": 0.03616194799542427, + "learning_rate": 2.169775895895745e-06, + "loss": 
0.0002, + "step": 280 + }, + { + "epoch": 3.857142857142857, + "grad_norm": 0.003307241713628173, + "learning_rate": 2.146341743097604e-06, + "loss": 0.0, + "step": 281 + }, + { + "epoch": 3.870967741935484, + "grad_norm": 0.023682212457060814, + "learning_rate": 2.1229392570965656e-06, + "loss": 0.0002, + "step": 282 + }, + { + "epoch": 3.8847926267281108, + "grad_norm": 0.08077914267778397, + "learning_rate": 2.0995705333668948e-06, + "loss": 0.0006, + "step": 283 + }, + { + "epoch": 3.8986175115207375, + "grad_norm": 0.012258109636604786, + "learning_rate": 2.0762376643597586e-06, + "loss": 0.0001, + "step": 284 + }, + { + "epoch": 3.912442396313364, + "grad_norm": 0.012420260347425938, + "learning_rate": 2.0529427393158704e-06, + "loss": 0.0001, + "step": 285 + }, + { + "epoch": 3.9262672811059907, + "grad_norm": 0.03773212060332298, + "learning_rate": 2.0296878440784164e-06, + "loss": 0.0001, + "step": 286 + }, + { + "epoch": 3.9400921658986174, + "grad_norm": 0.03834720700979233, + "learning_rate": 2.006475060906283e-06, + "loss": 0.0001, + "step": 287 + }, + { + "epoch": 3.953917050691244, + "grad_norm": 0.06677021831274033, + "learning_rate": 1.9833064682876175e-06, + "loss": 0.0001, + "step": 288 + }, + { + "epoch": 3.967741935483871, + "grad_norm": 0.010336378589272499, + "learning_rate": 1.9601841407537157e-06, + "loss": 0.0, + "step": 289 + }, + { + "epoch": 3.9815668202764978, + "grad_norm": 0.003101527225226164, + "learning_rate": 1.937110148693265e-06, + "loss": 0.0, + "step": 290 + }, + { + "epoch": 3.9953917050691246, + "grad_norm": 0.033553846180438995, + "learning_rate": 1.9140865581669627e-06, + "loss": 0.0001, + "step": 291 + }, + { + "epoch": 4.0, + "grad_norm": 0.033553846180438995, + "learning_rate": 1.8911154307225204e-06, + "loss": 0.0001, + "step": 292 + }, + { + "epoch": 4.013824884792626, + "grad_norm": 0.050998423248529434, + "learning_rate": 1.8681988232100674e-06, + "loss": 0.0, + "step": 293 + }, + { + "epoch": 4.027649769585254, + 
"grad_norm": 0.016056543216109276, + "learning_rate": 1.8453387875979834e-06, + "loss": 0.0, + "step": 294 + }, + { + "epoch": 4.04147465437788, + "grad_norm": 0.003037769114598632, + "learning_rate": 1.822537370789163e-06, + "loss": 0.0, + "step": 295 + }, + { + "epoch": 4.055299539170507, + "grad_norm": 0.01739078015089035, + "learning_rate": 1.7997966144377328e-06, + "loss": 0.0001, + "step": 296 + }, + { + "epoch": 4.0691244239631335, + "grad_norm": 0.012232556939125061, + "learning_rate": 1.7771185547662417e-06, + "loss": 0.0, + "step": 297 + }, + { + "epoch": 4.082949308755761, + "grad_norm": 0.0021200303453952074, + "learning_rate": 1.754505222383337e-06, + "loss": 0.0, + "step": 298 + }, + { + "epoch": 4.096774193548387, + "grad_norm": 0.0028847770299762487, + "learning_rate": 1.7319586421019383e-06, + "loss": 0.0, + "step": 299 + }, + { + "epoch": 4.110599078341014, + "grad_norm": 0.02263806015253067, + "learning_rate": 1.7094808327579401e-06, + "loss": 0.0001, + "step": 300 + }, + { + "epoch": 4.124423963133641, + "grad_norm": 0.008537916466593742, + "learning_rate": 1.6870738070294412e-06, + "loss": 0.0, + "step": 301 + }, + { + "epoch": 4.138248847926267, + "grad_norm": 0.0025778906419873238, + "learning_rate": 1.6647395712565256e-06, + "loss": 0.0, + "step": 302 + }, + { + "epoch": 4.152073732718894, + "grad_norm": 0.008922383189201355, + "learning_rate": 1.6424801252616186e-06, + "loss": 0.0, + "step": 303 + }, + { + "epoch": 4.1658986175115205, + "grad_norm": 0.008659109473228455, + "learning_rate": 1.6202974621704176e-06, + "loss": 0.0, + "step": 304 + }, + { + "epoch": 4.179723502304148, + "grad_norm": 0.006877637468278408, + "learning_rate": 1.5981935682334266e-06, + "loss": 0.0001, + "step": 305 + }, + { + "epoch": 4.193548387096774, + "grad_norm": 0.00891530979424715, + "learning_rate": 1.5761704226481078e-06, + "loss": 0.0001, + "step": 306 + }, + { + "epoch": 4.207373271889401, + "grad_norm": 0.005250364542007446, + "learning_rate": 
1.5542299973816626e-06, + "loss": 0.0, + "step": 307 + }, + { + "epoch": 4.221198156682028, + "grad_norm": 0.005915890447795391, + "learning_rate": 1.5323742569944573e-06, + "loss": 0.0, + "step": 308 + }, + { + "epoch": 4.235023041474655, + "grad_norm": 0.008372033014893532, + "learning_rate": 1.5106051584641208e-06, + "loss": 0.0, + "step": 309 + }, + { + "epoch": 4.248847926267281, + "grad_norm": 0.0033532341476529837, + "learning_rate": 1.4889246510103078e-06, + "loss": 0.0, + "step": 310 + }, + { + "epoch": 4.2626728110599075, + "grad_norm": 0.017146175727248192, + "learning_rate": 1.4673346759201728e-06, + "loss": 0.0001, + "step": 311 + }, + { + "epoch": 4.276497695852535, + "grad_norm": 0.010326577350497246, + "learning_rate": 1.44583716637454e-06, + "loss": 0.0, + "step": 312 + }, + { + "epoch": 4.290322580645161, + "grad_norm": 0.0025458873715251684, + "learning_rate": 1.4244340472748076e-06, + "loss": 0.0, + "step": 313 + }, + { + "epoch": 4.304147465437788, + "grad_norm": 0.0022526225075125694, + "learning_rate": 1.403127235070587e-06, + "loss": 0.0, + "step": 314 + }, + { + "epoch": 4.317972350230415, + "grad_norm": 0.029883896932005882, + "learning_rate": 1.381918637588112e-06, + "loss": 0.0001, + "step": 315 + }, + { + "epoch": 4.331797235023042, + "grad_norm": 0.005568996071815491, + "learning_rate": 1.3608101538593965e-06, + "loss": 0.0, + "step": 316 + }, + { + "epoch": 4.345622119815668, + "grad_norm": 0.005829329136759043, + "learning_rate": 1.3398036739522088e-06, + "loss": 0.0, + "step": 317 + }, + { + "epoch": 4.359447004608295, + "grad_norm": 0.010301432572305202, + "learning_rate": 1.3189010788008234e-06, + "loss": 0.0, + "step": 318 + }, + { + "epoch": 4.373271889400922, + "grad_norm": 0.0017158942064270377, + "learning_rate": 1.2981042400376032e-06, + "loss": 0.0, + "step": 319 + }, + { + "epoch": 4.387096774193548, + "grad_norm": 0.00932268425822258, + "learning_rate": 1.277415019825417e-06, + "loss": 0.0, + "step": 320 + }, + { + 
"epoch": 4.400921658986175, + "grad_norm": 0.002834129147231579, + "learning_rate": 1.2568352706908937e-06, + "loss": 0.0, + "step": 321 + }, + { + "epoch": 4.414746543778802, + "grad_norm": 0.002881410764530301, + "learning_rate": 1.2363668353585486e-06, + "loss": 0.0, + "step": 322 + }, + { + "epoch": 4.428571428571429, + "grad_norm": 0.01270334329456091, + "learning_rate": 1.216011546585785e-06, + "loss": 0.0, + "step": 323 + }, + { + "epoch": 4.442396313364055, + "grad_norm": 0.010466398671269417, + "learning_rate": 1.195771226998789e-06, + "loss": 0.0, + "step": 324 + }, + { + "epoch": 4.456221198156682, + "grad_norm": 0.00680310744792223, + "learning_rate": 1.1756476889293269e-06, + "loss": 0.0001, + "step": 325 + }, + { + "epoch": 4.470046082949309, + "grad_norm": 0.012853645719587803, + "learning_rate": 1.1556427342524698e-06, + "loss": 0.0, + "step": 326 + }, + { + "epoch": 4.483870967741936, + "grad_norm": 0.009427334181964397, + "learning_rate": 1.1357581542252555e-06, + "loss": 0.0, + "step": 327 + }, + { + "epoch": 4.497695852534562, + "grad_norm": 0.002769877901300788, + "learning_rate": 1.1159957293262888e-06, + "loss": 0.0, + "step": 328 + }, + { + "epoch": 4.511520737327189, + "grad_norm": 0.003061062190681696, + "learning_rate": 1.0963572290963298e-06, + "loss": 0.0, + "step": 329 + }, + { + "epoch": 4.525345622119816, + "grad_norm": 0.02316022664308548, + "learning_rate": 1.0768444119798357e-06, + "loss": 0.0001, + "step": 330 + }, + { + "epoch": 4.539170506912442, + "grad_norm": 0.005801917053759098, + "learning_rate": 1.0574590251675145e-06, + "loss": 0.0, + "step": 331 + }, + { + "epoch": 4.552995391705069, + "grad_norm": 0.006601040717214346, + "learning_rate": 1.0382028044398823e-06, + "loss": 0.0, + "step": 332 + }, + { + "epoch": 4.566820276497696, + "grad_norm": 0.0029323461931198835, + "learning_rate": 1.0190774740118343e-06, + "loss": 0.0, + "step": 333 + }, + { + "epoch": 4.580645161290323, + "grad_norm": 0.02404218353331089, + 
"learning_rate": 1.0000847463782615e-06, + "loss": 0.0, + "step": 334 + }, + { + "epoch": 4.594470046082949, + "grad_norm": 0.0017514690989628434, + "learning_rate": 9.812263221607114e-07, + "loss": 0.0, + "step": 335 + }, + { + "epoch": 4.6082949308755765, + "grad_norm": 0.006374394986778498, + "learning_rate": 9.625038899551162e-07, + "loss": 0.0, + "step": 336 + }, + { + "epoch": 4.622119815668203, + "grad_norm": 0.009410557337105274, + "learning_rate": 9.439191261805894e-07, + "loss": 0.0001, + "step": 337 + }, + { + "epoch": 4.635944700460829, + "grad_norm": 0.006760374642908573, + "learning_rate": 9.254736949293216e-07, + "loss": 0.0, + "step": 338 + }, + { + "epoch": 4.649769585253456, + "grad_norm": 0.002450983738526702, + "learning_rate": 9.07169247817579e-07, + "loss": 0.0, + "step": 339 + }, + { + "epoch": 4.663594470046083, + "grad_norm": 0.006942540407180786, + "learning_rate": 8.890074238378074e-07, + "loss": 0.0, + "step": 340 + }, + { + "epoch": 4.67741935483871, + "grad_norm": 0.0011738522443920374, + "learning_rate": 8.709898492118885e-07, + "loss": 0.0, + "step": 341 + }, + { + "epoch": 4.691244239631336, + "grad_norm": 0.002247450640425086, + "learning_rate": 8.531181372455161e-07, + "loss": 0.0, + "step": 342 + }, + { + "epoch": 4.705069124423963, + "grad_norm": 0.007639207877218723, + "learning_rate": 8.353938881837445e-07, + "loss": 0.0, + "step": 343 + }, + { + "epoch": 4.71889400921659, + "grad_norm": 0.03200659900903702, + "learning_rate": 8.178186890677029e-07, + "loss": 0.0001, + "step": 344 + }, + { + "epoch": 4.732718894009217, + "grad_norm": 0.006035828962922096, + "learning_rate": 8.003941135924859e-07, + "loss": 0.0, + "step": 345 + }, + { + "epoch": 4.746543778801843, + "grad_norm": 0.008785420097410679, + "learning_rate": 7.83121721966248e-07, + "loss": 0.0, + "step": 346 + }, + { + "epoch": 4.76036866359447, + "grad_norm": 0.0018518904689699411, + "learning_rate": 7.66003060770498e-07, + "loss": 0.0, + "step": 347 + }, + { + 
"epoch": 4.774193548387097, + "grad_norm": 0.0026455053593963385, + "learning_rate": 7.490396628216237e-07, + "loss": 0.0, + "step": 348 + }, + { + "epoch": 4.788018433179723, + "grad_norm": 0.011131388135254383, + "learning_rate": 7.322330470336314e-07, + "loss": 0.0, + "step": 349 + }, + { + "epoch": 4.8018433179723505, + "grad_norm": 0.00781510304659605, + "learning_rate": 7.155847182821524e-07, + "loss": 0.0, + "step": 350 + }, + { + "epoch": 4.815668202764977, + "grad_norm": 0.007307104766368866, + "learning_rate": 6.990961672696908e-07, + "loss": 0.0, + "step": 351 + }, + { + "epoch": 4.829493087557603, + "grad_norm": 0.007883993908762932, + "learning_rate": 6.827688703921407e-07, + "loss": 0.0, + "step": 352 + }, + { + "epoch": 4.84331797235023, + "grad_norm": 0.029551049694418907, + "learning_rate": 6.666042896065983e-07, + "loss": 0.0, + "step": 353 + }, + { + "epoch": 4.857142857142857, + "grad_norm": 0.00934284646064043, + "learning_rate": 6.506038723004484e-07, + "loss": 0.0, + "step": 354 + }, + { + "epoch": 4.870967741935484, + "grad_norm": 0.003877162467688322, + "learning_rate": 6.347690511617693e-07, + "loss": 0.0, + "step": 355 + }, + { + "epoch": 4.88479262672811, + "grad_norm": 0.005847269669175148, + "learning_rate": 6.191012440510469e-07, + "loss": 0.0, + "step": 356 + }, + { + "epoch": 4.8986175115207375, + "grad_norm": 0.001081401132978499, + "learning_rate": 6.036018538742208e-07, + "loss": 0.0, + "step": 357 + }, + { + "epoch": 4.912442396313364, + "grad_norm": 0.002679765224456787, + "learning_rate": 5.882722684570638e-07, + "loss": 0.0, + "step": 358 + }, + { + "epoch": 4.926267281105991, + "grad_norm": 0.011573799885809422, + "learning_rate": 5.731138604209169e-07, + "loss": 0.0, + "step": 359 + }, + { + "epoch": 4.940092165898617, + "grad_norm": 0.0016941127832978964, + "learning_rate": 5.581279870597866e-07, + "loss": 0.0, + "step": 360 + } + ], + "logging_steps": 1, + "max_steps": 432, + "num_input_tokens_seen": 0, + 
"num_train_epochs": 6, + "save_steps": 72, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.298790820660537e+19, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-360/training_args.bin b/checkpoint-360/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..66c881ded23f97eecfbd08abb955a7188907de16 --- /dev/null +++ b/checkpoint-360/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bccf4859d2ee74fd992386ba4a70a4c4fc6d0da061af69465c1db71ce0f24882 +size 7928 diff --git a/checkpoint-360/zero_to_fp32.py b/checkpoint-360/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/checkpoint-360/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in 
state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to 
the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel 
{unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + 
print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + 
partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the 
partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + 
state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. 
you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. 
Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model``: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/checkpoint-72/README.md b/checkpoint-72/README.md new file mode 100644 index 0000000000000000000000000000000000000000..037e1a543b9c1891b5c6981f89d5b7c7c9a907ae --- /dev/null +++ b/checkpoint-72/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Llama-3.1-70B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.15.0 \ No newline at end of file diff --git a/checkpoint-72/adapter_config.json b/checkpoint-72/adapter_config.json new file mode 100644 index 
0000000000000000000000000000000000000000..3295690c20dff5dc1d1f30f8500f0efb7e255838 --- /dev/null +++ b/checkpoint-72/adapter_config.json @@ -0,0 +1,42 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.1-70B-Instruct", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 512, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [ + "embed_tokens", + "lm_head" + ], + "peft_type": "LORA", + "r": 256, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "down_proj", + "gate_proj", + "v_proj", + "up_proj", + "k_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-72/adapter_model.safetensors b/checkpoint-72/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c504a6ddef5eb1422de4fc1221d121e81f24b73d --- /dev/null +++ b/checkpoint-72/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e1645bb0023c2032caef388f843f839a8223a32ab210e8a590b9bcec1f113ab +size 10829849744 diff --git a/checkpoint-72/global_step72/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-72/global_step72/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..df24d0c0650b5e65eb515b8274ebb82fb043c933 --- /dev/null +++ b/checkpoint-72/global_step72/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ae61a1291a71000ee98ffa114a58a98caa5611a049963237859181150dbcf44 
+size 21659418140 diff --git a/checkpoint-72/global_step72/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-72/global_step72/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b9a0da5bf3145112ad9543a5e06e24125ece976e --- /dev/null +++ b/checkpoint-72/global_step72/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a16c25b7ca06f6e826a7955746fc2b6b9de33e9e9b469c14e8e6d0852997eb1f +size 21659457372 diff --git a/checkpoint-72/global_step72/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-72/global_step72/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2b2d8f65aba8bba1d7c0fee6f6184822203f984c --- /dev/null +++ b/checkpoint-72/global_step72/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4c9506b7eece856bd0fe49e523380ee24b468d501aaed57e17b5c94a5b9c50f +size 21659417820 diff --git a/checkpoint-72/global_step72/mp_rank_00_model_states.pt b/checkpoint-72/global_step72/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c3e3682c450ffd4b21945dc79f617a41ae5db1cb --- /dev/null +++ b/checkpoint-72/global_step72/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39e4211f5c4aba9c19da4c7f110d32ce3642765b1f634082b5cd1bcead404340 +size 11918643933 diff --git a/checkpoint-72/latest b/checkpoint-72/latest new file mode 100644 index 0000000000000000000000000000000000000000..f3ff0f3ef57eac4f36c543b2d7ef78ca727041bd --- /dev/null +++ b/checkpoint-72/latest @@ -0,0 +1 @@ +global_step72 \ No newline at end of file diff --git a/checkpoint-72/rng_state_0.pth b/checkpoint-72/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..6e03436dd77f0f742b73e3f601a58d05364ee48b --- /dev/null 
+++ b/checkpoint-72/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:541a247a0499946942d469144d4609ab54f406a01327defecf24e55cce3eaaff +size 14768 diff --git a/checkpoint-72/rng_state_1.pth b/checkpoint-72/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..0d2065fa0d9a503d409eaed77bd3dafcec8c6e51 --- /dev/null +++ b/checkpoint-72/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55f595d3bc4cf74ef1c4bf07834b2d3c1153e4c96ec66ee50cd533cd68d3f2be +size 14768 diff --git a/checkpoint-72/rng_state_2.pth b/checkpoint-72/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..b59ecfae7f98fa951c562700fd917c39af7c9ffe --- /dev/null +++ b/checkpoint-72/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a141ba5106d9cb0d6d4ea1db081a08d8a6182e2ca548def74038dc2ab25e5894 +size 14768 diff --git a/checkpoint-72/scheduler.pt b/checkpoint-72/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..324d6e5ba59dc80aafdba02fe2ebc9eac737c54b --- /dev/null +++ b/checkpoint-72/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14d3ad851fc136efe822990f8b99840e98b2ff20804944bcf122f2cafb45ed1f +size 1064 diff --git a/checkpoint-72/special_tokens_map.json b/checkpoint-72/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..278b7f0f84be865c4687700ee7b3c63d89a51e18 --- /dev/null +++ b/checkpoint-72/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false + } +} diff --git a/checkpoint-72/tokenizer.json b/checkpoint-72/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-72/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-72/tokenizer_config.json b/checkpoint-72/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ca91a2ef55f4239a7af81d7c9abb05f53621a07b --- /dev/null +++ b/checkpoint-72/tokenizer_config.json @@ -0,0 +1,2064 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true 
+ }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": 
"<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": 
"<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": 
false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": 
false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": 
"<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": 
"<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if 
custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + 
"extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|end_of_text|>", + "tokenizer_class": "PreTrainedTokenizer" +} diff --git a/checkpoint-72/trainer_state.json b/checkpoint-72/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..7ff93026ad84fd0c489668ee9f7488080ce58872 --- /dev/null +++ b/checkpoint-72/trainer_state.json @@ -0,0 +1,537 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9953917050691244, + "eval_steps": 500, + "global_step": 72, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.013824884792626729, + "grad_norm": 31.00213623046875, + "learning_rate": 5.0000000000000004e-08, + "loss": 2.2089, + "step": 1 + }, + { + "epoch": 0.027649769585253458, + "grad_norm": 30.27136993408203, + "learning_rate": 1.0000000000000001e-07, + "loss": 2.1536, + "step": 2 + }, + { + "epoch": 0.041474654377880185, + "grad_norm": 30.48703384399414, + "learning_rate": 1.5000000000000002e-07, + "loss": 2.1581, + "step": 3 + }, + { + "epoch": 0.055299539170506916, + "grad_norm": 30.779329299926758, + "learning_rate": 2.0000000000000002e-07, + "loss": 2.1741, + "step": 4 + }, + { + "epoch": 0.06912442396313365, + "grad_norm": 31.22808837890625, + "learning_rate": 2.5000000000000004e-07, + "loss": 2.1864, + "step": 5 + }, + { + "epoch": 0.08294930875576037, + "grad_norm": 30.783327102661133, + "learning_rate": 3.0000000000000004e-07, + "loss": 2.1993, + "step": 6 + }, + { + "epoch": 0.0967741935483871, + "grad_norm": 30.57423210144043, + "learning_rate": 3.5000000000000004e-07, + "loss": 2.1506, + "step": 7 + }, + { + "epoch": 0.11059907834101383, + "grad_norm": 30.952186584472656, + "learning_rate": 4.0000000000000003e-07, + "loss": 2.1599, + "step": 8 + }, + { + "epoch": 0.12442396313364056, + "grad_norm": 30.37245750427246, + "learning_rate": 
4.5000000000000003e-07, + "loss": 2.1572, + "step": 9 + }, + { + "epoch": 0.1382488479262673, + "grad_norm": 30.930192947387695, + "learning_rate": 5.000000000000001e-07, + "loss": 2.1447, + "step": 10 + }, + { + "epoch": 0.15207373271889402, + "grad_norm": 29.735448837280273, + "learning_rate": 5.5e-07, + "loss": 2.0742, + "step": 11 + }, + { + "epoch": 0.16589861751152074, + "grad_norm": 29.62826156616211, + "learning_rate": 6.000000000000001e-07, + "loss": 2.061, + "step": 12 + }, + { + "epoch": 0.17972350230414746, + "grad_norm": 28.937463760375977, + "learning_rate": 6.5e-07, + "loss": 1.9974, + "step": 13 + }, + { + "epoch": 0.1935483870967742, + "grad_norm": 29.24833106994629, + "learning_rate": 7.000000000000001e-07, + "loss": 1.9833, + "step": 14 + }, + { + "epoch": 0.2073732718894009, + "grad_norm": 28.122018814086914, + "learning_rate": 7.5e-07, + "loss": 1.8934, + "step": 15 + }, + { + "epoch": 0.22119815668202766, + "grad_norm": 28.059659957885742, + "learning_rate": 8.000000000000001e-07, + "loss": 1.875, + "step": 16 + }, + { + "epoch": 0.2350230414746544, + "grad_norm": 27.361961364746094, + "learning_rate": 8.500000000000001e-07, + "loss": 1.8009, + "step": 17 + }, + { + "epoch": 0.2488479262672811, + "grad_norm": 26.721765518188477, + "learning_rate": 9.000000000000001e-07, + "loss": 1.7116, + "step": 18 + }, + { + "epoch": 0.2626728110599078, + "grad_norm": 25.37330436706543, + "learning_rate": 9.500000000000001e-07, + "loss": 1.5608, + "step": 19 + }, + { + "epoch": 0.2764976958525346, + "grad_norm": 25.81206703186035, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.5043, + "step": 20 + }, + { + "epoch": 0.2903225806451613, + "grad_norm": 25.539344787597656, + "learning_rate": 1.0500000000000001e-06, + "loss": 1.3673, + "step": 21 + }, + { + "epoch": 0.30414746543778803, + "grad_norm": 25.097164154052734, + "learning_rate": 1.1e-06, + "loss": 1.2029, + "step": 22 + }, + { + "epoch": 0.31797235023041476, + "grad_norm": 24.619497299194336, 
+ "learning_rate": 1.1500000000000002e-06, + "loss": 1.0458, + "step": 23 + }, + { + "epoch": 0.3317972350230415, + "grad_norm": 23.820302963256836, + "learning_rate": 1.2000000000000002e-06, + "loss": 0.8723, + "step": 24 + }, + { + "epoch": 0.3456221198156682, + "grad_norm": 23.12735939025879, + "learning_rate": 1.25e-06, + "loss": 0.7183, + "step": 25 + }, + { + "epoch": 0.35944700460829493, + "grad_norm": 20.127134323120117, + "learning_rate": 1.3e-06, + "loss": 0.5248, + "step": 26 + }, + { + "epoch": 0.37327188940092165, + "grad_norm": 15.901495933532715, + "learning_rate": 1.3500000000000002e-06, + "loss": 0.3689, + "step": 27 + }, + { + "epoch": 0.3870967741935484, + "grad_norm": 11.053832054138184, + "learning_rate": 1.4000000000000001e-06, + "loss": 0.2482, + "step": 28 + }, + { + "epoch": 0.4009216589861751, + "grad_norm": 7.248495578765869, + "learning_rate": 1.45e-06, + "loss": 0.1847, + "step": 29 + }, + { + "epoch": 0.4147465437788018, + "grad_norm": 5.378540515899658, + "learning_rate": 1.5e-06, + "loss": 0.1423, + "step": 30 + }, + { + "epoch": 0.42857142857142855, + "grad_norm": 3.8371808528900146, + "learning_rate": 1.5500000000000002e-06, + "loss": 0.1152, + "step": 31 + }, + { + "epoch": 0.4423963133640553, + "grad_norm": 2.2655274868011475, + "learning_rate": 1.6000000000000001e-06, + "loss": 0.0845, + "step": 32 + }, + { + "epoch": 0.45622119815668205, + "grad_norm": 1.5746861696243286, + "learning_rate": 1.6500000000000003e-06, + "loss": 0.0711, + "step": 33 + }, + { + "epoch": 0.4700460829493088, + "grad_norm": 1.3510947227478027, + "learning_rate": 1.7000000000000002e-06, + "loss": 0.0734, + "step": 34 + }, + { + "epoch": 0.4838709677419355, + "grad_norm": 0.9737389087677002, + "learning_rate": 1.75e-06, + "loss": 0.0651, + "step": 35 + }, + { + "epoch": 0.4976958525345622, + "grad_norm": 0.9815284609794617, + "learning_rate": 1.8000000000000001e-06, + "loss": 0.0593, + "step": 36 + }, + { + "epoch": 0.511520737327189, + "grad_norm": 
0.8567912578582764, + "learning_rate": 1.85e-06, + "loss": 0.0543, + "step": 37 + }, + { + "epoch": 0.5253456221198156, + "grad_norm": 0.6773302555084229, + "learning_rate": 1.9000000000000002e-06, + "loss": 0.0622, + "step": 38 + }, + { + "epoch": 0.5391705069124424, + "grad_norm": 0.49936285614967346, + "learning_rate": 1.9500000000000004e-06, + "loss": 0.0511, + "step": 39 + }, + { + "epoch": 0.5529953917050692, + "grad_norm": 0.6253588795661926, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.0478, + "step": 40 + }, + { + "epoch": 0.5668202764976958, + "grad_norm": 0.5103089809417725, + "learning_rate": 2.05e-06, + "loss": 0.0465, + "step": 41 + }, + { + "epoch": 0.5806451612903226, + "grad_norm": 0.29294702410697937, + "learning_rate": 2.1000000000000002e-06, + "loss": 0.0456, + "step": 42 + }, + { + "epoch": 0.5944700460829493, + "grad_norm": 0.4237954616546631, + "learning_rate": 2.15e-06, + "loss": 0.0501, + "step": 43 + }, + { + "epoch": 0.6082949308755761, + "grad_norm": 0.42243412137031555, + "learning_rate": 2.2e-06, + "loss": 0.0388, + "step": 44 + }, + { + "epoch": 0.6221198156682027, + "grad_norm": 0.37881818413734436, + "learning_rate": 2.25e-06, + "loss": 0.0415, + "step": 45 + }, + { + "epoch": 0.6359447004608295, + "grad_norm": 0.4941152036190033, + "learning_rate": 2.3000000000000004e-06, + "loss": 0.045, + "step": 46 + }, + { + "epoch": 0.6497695852534562, + "grad_norm": 0.3046450912952423, + "learning_rate": 2.35e-06, + "loss": 0.0386, + "step": 47 + }, + { + "epoch": 0.663594470046083, + "grad_norm": 0.39361852407455444, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.0447, + "step": 48 + }, + { + "epoch": 0.6774193548387096, + "grad_norm": 0.5190001130104065, + "learning_rate": 2.4500000000000003e-06, + "loss": 0.0364, + "step": 49 + }, + { + "epoch": 0.6912442396313364, + "grad_norm": 0.372072696685791, + "learning_rate": 2.5e-06, + "loss": 0.043, + "step": 50 + }, + { + "epoch": 0.7050691244239631, + "grad_norm": 
0.3756551146507263, + "learning_rate": 2.55e-06, + "loss": 0.0424, + "step": 51 + }, + { + "epoch": 0.7188940092165899, + "grad_norm": 0.4593554437160492, + "learning_rate": 2.6e-06, + "loss": 0.0387, + "step": 52 + }, + { + "epoch": 0.7327188940092166, + "grad_norm": 0.2931855618953705, + "learning_rate": 2.6500000000000005e-06, + "loss": 0.0396, + "step": 53 + }, + { + "epoch": 0.7465437788018433, + "grad_norm": 0.38429534435272217, + "learning_rate": 2.7000000000000004e-06, + "loss": 0.0373, + "step": 54 + }, + { + "epoch": 0.7603686635944701, + "grad_norm": 0.3506857752799988, + "learning_rate": 2.7500000000000004e-06, + "loss": 0.04, + "step": 55 + }, + { + "epoch": 0.7741935483870968, + "grad_norm": 0.29847028851509094, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.0369, + "step": 56 + }, + { + "epoch": 0.7880184331797235, + "grad_norm": 0.3653375506401062, + "learning_rate": 2.85e-06, + "loss": 0.0396, + "step": 57 + }, + { + "epoch": 0.8018433179723502, + "grad_norm": 0.3163083791732788, + "learning_rate": 2.9e-06, + "loss": 0.0337, + "step": 58 + }, + { + "epoch": 0.815668202764977, + "grad_norm": 0.3734363615512848, + "learning_rate": 2.95e-06, + "loss": 0.0327, + "step": 59 + }, + { + "epoch": 0.8294930875576036, + "grad_norm": 0.29547712206840515, + "learning_rate": 3e-06, + "loss": 0.0365, + "step": 60 + }, + { + "epoch": 0.8433179723502304, + "grad_norm": 0.4041007161140442, + "learning_rate": 3.05e-06, + "loss": 0.038, + "step": 61 + }, + { + "epoch": 0.8571428571428571, + "grad_norm": 0.3602149784564972, + "learning_rate": 3.1000000000000004e-06, + "loss": 0.033, + "step": 62 + }, + { + "epoch": 0.8709677419354839, + "grad_norm": 0.2948857545852661, + "learning_rate": 3.1500000000000003e-06, + "loss": 0.0386, + "step": 63 + }, + { + "epoch": 0.8847926267281107, + "grad_norm": 0.39098358154296875, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.0323, + "step": 64 + }, + { + "epoch": 0.8986175115207373, + "grad_norm": 
0.3692062795162201, + "learning_rate": 3.2500000000000002e-06, + "loss": 0.0309, + "step": 65 + }, + { + "epoch": 0.9124423963133641, + "grad_norm": 0.3967229425907135, + "learning_rate": 3.3000000000000006e-06, + "loss": 0.0346, + "step": 66 + }, + { + "epoch": 0.9262672811059908, + "grad_norm": 0.47776708006858826, + "learning_rate": 3.3500000000000005e-06, + "loss": 0.0355, + "step": 67 + }, + { + "epoch": 0.9400921658986175, + "grad_norm": 0.21545131504535675, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.0294, + "step": 68 + }, + { + "epoch": 0.9539170506912442, + "grad_norm": 0.23738539218902588, + "learning_rate": 3.45e-06, + "loss": 0.0308, + "step": 69 + }, + { + "epoch": 0.967741935483871, + "grad_norm": 0.29174014925956726, + "learning_rate": 3.5e-06, + "loss": 0.0312, + "step": 70 + }, + { + "epoch": 0.9815668202764977, + "grad_norm": 0.38475602865219116, + "learning_rate": 3.5500000000000003e-06, + "loss": 0.0324, + "step": 71 + }, + { + "epoch": 0.9953917050691244, + "grad_norm": 0.4077378809452057, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.0297, + "step": 72 + } + ], + "logging_steps": 1, + "max_steps": 432, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 72, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.631891949769458e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-72/training_args.bin b/checkpoint-72/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..66c881ded23f97eecfbd08abb955a7188907de16 --- /dev/null +++ b/checkpoint-72/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bccf4859d2ee74fd992386ba4a70a4c4fc6d0da061af69465c1db71ce0f24882 +size 7928 diff --git 
a/checkpoint-72/zero_to_fp32.py b/checkpoint-72/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/checkpoint-72/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in 
state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to 
the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel 
{unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + 
print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + 
partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the 
partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + 
state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. 
you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. 
Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/config.json b/config.json new file mode 100644 index 0000000000000000000000000000000000000000..358245ff2f9aa0dab8c73b59c5e35464c3e7b467 --- /dev/null +++ b/config.json @@ -0,0 +1,52 @@ +{ + "_attn_implementation_autoset": true, + "_name_or_path": "meta-llama/Llama-3.1-70B-Instruct", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": 128009, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 8192, + "initializer_range": 0.02, + "intermediate_size": 28672, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 64, + "num_hidden_layers": 80, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "quantization_config": { + "_load_in_4bit": true, + "_load_in_8bit": false, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_storage": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "llm_int8_enable_fp32_cpu_offload": false, + "llm_int8_has_fp16_weight": false, + "llm_int8_skip_modules": null, + "llm_int8_threshold": 6.0, + "load_in_4bit": true, + "load_in_8bit": false, + "quant_method": "bitsandbytes" + }, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.49.0", + "use_cache": false, + "vocab_size": 
128256 +} diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..278b7f0f84be865c4687700ee7b3c63d89a51e18 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ca91a2ef55f4239a7af81d7c9abb05f53621a07b --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,2064 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": 
true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": 
false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, 
+ "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": 
"<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": 
false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": 
"<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": 
"<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + 
"extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|end_of_text|>", + "tokenizer_class": "PreTrainedTokenizer" +}