diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d0eb64aa91f4e4c7d9d049781777bb5618c7fb58 --- /dev/null +++ b/README.md @@ -0,0 +1,143 @@ +--- +library_name: peft +license: llama3.1 +base_model: meta-llama/Llama-3.1-8B-Instruct +tags: +- generated_from_trainer +datasets: +- ugaoo/medmcqa30k_normal +model-index: +- name: out/meta_llama_Llama_3.1_8B_Instruct_ugaoo_medmcqa30k_normal + results: [] +--- + + + +[Built with Axolotl](https://github.com/axolotl-ai-cloud/axolotl) +
See axolotl config + +axolotl version: `0.8.0.dev0` +```yaml +base_model: meta-llama/Llama-3.1-8B-Instruct +model_type: AutoModelForCausalLM +tokenizer_type: AutoTokenizer +trust_remote_code: true + +load_in_8bit: false +load_in_4bit: true +strict: false + +datasets: + - path: ugaoo/medmcqa30k_normal + type: alpaca +val_set_size: 0 +output_dir: ./out/meta_llama_Llama_3.1_8B_Instruct_ugaoo_medmcqa30k_normal + +sequence_len: 4000 +sample_packing: true +pad_to_sequence_len: true + +adapter: qlora +lora_r: 256 +lora_alpha: 512 +lora_dropout: 0.05 +lora_target_linear: true +lora_target_modules: + - q_proj + - k_proj + - v_proj + - o_proj + - up_proj + - down_proj + - gate_proj +lora_modules_to_save: + - embed_tokens + - lm_head + +wandb_project: testsearch +wandb_entity: +wandb_watch: +wandb_name: meta_llama_Llama_3.1_8B_Instruct_ugaoo_medmcqa30k_normal +wandb_log_model: + +gradient_accumulation_steps: 3 +micro_batch_size: 4 +num_epochs: 6 +optimizer: adamw_torch +lr_scheduler: cosine +learning_rate: 5e-6 + +train_on_inputs: false +group_by_length: false +bf16: auto +fp16: false +tf32: false + +gradient_checkpointing: true +early_stopping_patience: +resume_from_checkpoint: +logging_steps: 1 +xformers_attention: +flash_attention: true + +warmup_steps: 100 +evals_per_epoch: 6 +eval_table_size: +saves_per_epoch: 1 +debug: +deepspeed: +weight_decay: 0.0 +fsdp: +fsdp_config: +save_total_limit: 6 +special_tokens: + pad_token: <|end_of_text|> +``` + +

+ +# out/meta_llama_Llama_3.1_8B_Instruct_ugaoo_medmcqa30k_normal + +This model is a fine-tuned version of [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) on the ugaoo/medmcqa30k_normal dataset. + +## Model description + +More information needed + +## Intended uses & limitations + +More information needed + +## Training and evaluation data + +More information needed + +## Training procedure + +### Training hyperparameters + +The following hyperparameters were used during training: +- learning_rate: 5e-06 +- train_batch_size: 4 +- eval_batch_size: 4 +- seed: 42 +- distributed_type: multi-GPU +- gradient_accumulation_steps: 3 +- total_train_batch_size: 12 +- optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments +- lr_scheduler_type: cosine +- lr_scheduler_warmup_steps: 100 +- num_epochs: 6.0 + +### Training results + + + +### Framework versions + +- PEFT 0.14.0 +- Transformers 4.49.0 +- Pytorch 2.5.1+cu124 +- Datasets 3.2.0 +- Tokenizers 0.21.0 \ No newline at end of file diff --git a/adapter_config.json b/adapter_config.json index 56c2f8e294f992c387eec3239301f13679bdf4e8..aaa71b6240dcb4147fb982eb2f0ff89574c4fb31 100644 --- a/adapter_config.json +++ b/adapter_config.json @@ -6,7 +6,7 @@ "eva_config": null, "exclude_modules": null, "fan_in_fan_out": null, - "inference_mode": false, + "inference_mode": true, "init_lora_weights": true, "layer_replication": null, "layers_pattern": null, diff --git a/adapter_model.safetensors b/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..53315ad4415ef8b3e9fcdaf52f955692b9fe42c8 --- /dev/null +++ b/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f9101224637dbc63c919a24fefc3d52e1b9fc2dc8d5355d259947e5d493e419 +size 3443586272 diff --git a/checkpoint-122/README.md b/checkpoint-122/README.md new file mode 100644 index 
0000000000000000000000000000000000000000..be5c87703f12b547886cc6a2ecfbe9ee150496fa --- /dev/null +++ b/checkpoint-122/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Llama-3.1-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/checkpoint-122/adapter_config.json b/checkpoint-122/adapter_config.json new file mode 100644 index 
0000000000000000000000000000000000000000..aaa71b6240dcb4147fb982eb2f0ff89574c4fb31 --- /dev/null +++ b/checkpoint-122/adapter_config.json @@ -0,0 +1,40 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.1-8B-Instruct", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 512, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [ + "embed_tokens", + "lm_head" + ], + "peft_type": "LORA", + "r": 256, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "k_proj", + "up_proj", + "gate_proj", + "v_proj", + "down_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-122/adapter_model.safetensors b/checkpoint-122/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f8a5c01a5a3126112945e985e0ecccb8c468de06 --- /dev/null +++ b/checkpoint-122/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0aabc8d8bd29a9af2ccff4c45bf700853ecc5ce435007a2ef858879cd056a5f +size 3443586272 diff --git a/checkpoint-122/global_step121/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-122/global_step121/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..34cff079c5fbe25043b952dac97c4372c6f2f0e3 --- /dev/null +++ b/checkpoint-122/global_step121/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf4fc747e772caa535bcc5305202dbcede4cdae392d99362f0d6405303e2140b +size 20661195036 diff --git 
a/checkpoint-122/global_step121/mp_rank_00_model_states.pt b/checkpoint-122/global_step121/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6675c17aa873b40a25d63e892911cd1c999a1975 --- /dev/null +++ b/checkpoint-122/global_step121/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d2ac16e72a47da966794503bf8e8b400e4ce17114bdbd2396795af984866c62 +size 3555326649 diff --git a/checkpoint-122/latest b/checkpoint-122/latest new file mode 100644 index 0000000000000000000000000000000000000000..9514df933ccf9579207bb754da90ca456691308e --- /dev/null +++ b/checkpoint-122/latest @@ -0,0 +1 @@ +global_step121 \ No newline at end of file diff --git a/checkpoint-122/rng_state.pth b/checkpoint-122/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..91475b704072c73a37ce284bc80ae0c64faeeab5 --- /dev/null +++ b/checkpoint-122/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:349d94629f281b4f86b82d7ad76484f15b03acf8791adcbede5027bdab09a1d0 +size 14244 diff --git a/checkpoint-122/scheduler.pt b/checkpoint-122/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..c204f93fae47593732bb601fa454b76e386d707f --- /dev/null +++ b/checkpoint-122/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:888fd97ed86ec79d32749f41b60052b780850ebdef2267edee04b642afcac14a +size 1064 diff --git a/checkpoint-122/special_tokens_map.json b/checkpoint-122/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..278b7f0f84be865c4687700ee7b3c63d89a51e18 --- /dev/null +++ b/checkpoint-122/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-122/tokenizer.json b/checkpoint-122/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-122/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-122/tokenizer_config.json b/checkpoint-122/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ca91a2ef55f4239a7af81d7c9abb05f53621a07b --- /dev/null +++ b/checkpoint-122/tokenizer_config.json @@ -0,0 +1,2064 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": 
"<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": 
"<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": 
false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": 
false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": 
"<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": 
"<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + 
"extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|end_of_text|>", + "tokenizer_class": "PreTrainedTokenizer" +} diff --git a/checkpoint-122/trainer_state.json b/checkpoint-122/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..017f3f4dfe81353b474617c92418af85ddcc25c0 --- /dev/null +++ b/checkpoint-122/trainer_state.json @@ -0,0 +1,887 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.972972972972973, + "eval_steps": 500, + "global_step": 122, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.016216216216216217, + "grad_norm": 39.12052917480469, + "learning_rate": 5.0000000000000004e-08, + "loss": 2.2957, + "step": 1 + }, + { + "epoch": 0.032432432432432434, + "grad_norm": 38.9581413269043, + "learning_rate": 1.0000000000000001e-07, + "loss": 2.2959, + "step": 2 + }, + { + "epoch": 0.04864864864864865, + "grad_norm": 39.2702751159668, + "learning_rate": 1.5000000000000002e-07, + "loss": 2.2677, + "step": 3 + }, + { + "epoch": 0.06486486486486487, + "grad_norm": 39.18815231323242, + "learning_rate": 2.0000000000000002e-07, + "loss": 2.2936, + "step": 4 + }, + { + "epoch": 0.08108108108108109, + "grad_norm": 38.66701889038086, + "learning_rate": 2.5000000000000004e-07, + "loss": 2.2561, + "step": 5 + }, + { + "epoch": 0.0972972972972973, + "grad_norm": 39.53536605834961, + "learning_rate": 3.0000000000000004e-07, + "loss": 2.2579, + "step": 6 + }, + { + "epoch": 0.11351351351351352, + "grad_norm": 39.3793830871582, + "learning_rate": 3.5000000000000004e-07, + "loss": 2.2627, + "step": 7 + }, + { + "epoch": 0.12972972972972974, + "grad_norm": 39.88922119140625, + "learning_rate": 4.0000000000000003e-07, + "loss": 2.2729, + "step": 8 + }, + { + "epoch": 0.14594594594594595, + "grad_norm": 37.9880256652832, + "learning_rate": 
4.5000000000000003e-07, + "loss": 2.2311, + "step": 9 + }, + { + "epoch": 0.16216216216216217, + "grad_norm": 37.024139404296875, + "learning_rate": 5.000000000000001e-07, + "loss": 2.1773, + "step": 10 + }, + { + "epoch": 0.1783783783783784, + "grad_norm": 36.89325714111328, + "learning_rate": 5.5e-07, + "loss": 2.1927, + "step": 11 + }, + { + "epoch": 0.1945945945945946, + "grad_norm": 37.244178771972656, + "learning_rate": 6.000000000000001e-07, + "loss": 2.1757, + "step": 12 + }, + { + "epoch": 0.21081081081081082, + "grad_norm": 34.77650451660156, + "learning_rate": 6.5e-07, + "loss": 2.0392, + "step": 13 + }, + { + "epoch": 0.22702702702702704, + "grad_norm": 34.78818893432617, + "learning_rate": 7.000000000000001e-07, + "loss": 1.9996, + "step": 14 + }, + { + "epoch": 0.24324324324324326, + "grad_norm": 34.86852264404297, + "learning_rate": 7.5e-07, + "loss": 1.9496, + "step": 15 + }, + { + "epoch": 0.2594594594594595, + "grad_norm": 35.202796936035156, + "learning_rate": 8.000000000000001e-07, + "loss": 1.8542, + "step": 16 + }, + { + "epoch": 0.2756756756756757, + "grad_norm": 34.11354064941406, + "learning_rate": 8.500000000000001e-07, + "loss": 1.7118, + "step": 17 + }, + { + "epoch": 0.2918918918918919, + "grad_norm": 36.309059143066406, + "learning_rate": 9.000000000000001e-07, + "loss": 1.6834, + "step": 18 + }, + { + "epoch": 0.3081081081081081, + "grad_norm": 34.69994354248047, + "learning_rate": 9.500000000000001e-07, + "loss": 1.5298, + "step": 19 + }, + { + "epoch": 0.32432432432432434, + "grad_norm": 35.43153381347656, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.4191, + "step": 20 + }, + { + "epoch": 0.34054054054054056, + "grad_norm": 33.53745651245117, + "learning_rate": 1.0500000000000001e-06, + "loss": 1.3068, + "step": 21 + }, + { + "epoch": 0.3567567567567568, + "grad_norm": 33.775604248046875, + "learning_rate": 1.1e-06, + "loss": 1.224, + "step": 22 + }, + { + "epoch": 0.372972972972973, + "grad_norm": 30.57005500793457, + 
"learning_rate": 1.1500000000000002e-06, + "loss": 1.0704, + "step": 23 + }, + { + "epoch": 0.3891891891891892, + "grad_norm": 27.964860916137695, + "learning_rate": 1.2000000000000002e-06, + "loss": 0.9548, + "step": 24 + }, + { + "epoch": 0.40540540540540543, + "grad_norm": 26.023576736450195, + "learning_rate": 1.25e-06, + "loss": 0.8503, + "step": 25 + }, + { + "epoch": 0.42162162162162165, + "grad_norm": 25.0452938079834, + "learning_rate": 1.3e-06, + "loss": 0.6938, + "step": 26 + }, + { + "epoch": 0.43783783783783786, + "grad_norm": 24.663373947143555, + "learning_rate": 1.3500000000000002e-06, + "loss": 0.5648, + "step": 27 + }, + { + "epoch": 0.4540540540540541, + "grad_norm": 21.61736488342285, + "learning_rate": 1.4000000000000001e-06, + "loss": 0.435, + "step": 28 + }, + { + "epoch": 0.4702702702702703, + "grad_norm": 18.3259334564209, + "learning_rate": 1.45e-06, + "loss": 0.3322, + "step": 29 + }, + { + "epoch": 0.4864864864864865, + "grad_norm": 16.80081558227539, + "learning_rate": 1.5e-06, + "loss": 0.2625, + "step": 30 + }, + { + "epoch": 0.5027027027027027, + "grad_norm": 14.789258003234863, + "learning_rate": 1.5500000000000002e-06, + "loss": 0.1757, + "step": 31 + }, + { + "epoch": 0.518918918918919, + "grad_norm": 10.406538963317871, + "learning_rate": 1.6000000000000001e-06, + "loss": 0.1376, + "step": 32 + }, + { + "epoch": 0.5351351351351351, + "grad_norm": 4.868802547454834, + "learning_rate": 1.6500000000000003e-06, + "loss": 0.0815, + "step": 33 + }, + { + "epoch": 0.5513513513513514, + "grad_norm": 1.8639686107635498, + "learning_rate": 1.7000000000000002e-06, + "loss": 0.0628, + "step": 34 + }, + { + "epoch": 0.5675675675675675, + "grad_norm": 1.897918462753296, + "learning_rate": 1.75e-06, + "loss": 0.0775, + "step": 35 + }, + { + "epoch": 0.5837837837837838, + "grad_norm": 1.296712040901184, + "learning_rate": 1.8000000000000001e-06, + "loss": 0.0565, + "step": 36 + }, + { + "epoch": 0.6, + "grad_norm": 1.0163214206695557, + 
"learning_rate": 1.85e-06, + "loss": 0.0544, + "step": 37 + }, + { + "epoch": 0.6162162162162163, + "grad_norm": 1.070162296295166, + "learning_rate": 1.9000000000000002e-06, + "loss": 0.0621, + "step": 38 + }, + { + "epoch": 0.6324324324324324, + "grad_norm": 1.024267315864563, + "learning_rate": 1.9500000000000004e-06, + "loss": 0.0566, + "step": 39 + }, + { + "epoch": 0.6486486486486487, + "grad_norm": 0.9016611576080322, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.0511, + "step": 40 + }, + { + "epoch": 0.6648648648648648, + "grad_norm": 0.8272562623023987, + "learning_rate": 2.05e-06, + "loss": 0.0533, + "step": 41 + }, + { + "epoch": 0.6810810810810811, + "grad_norm": 0.8875278234481812, + "learning_rate": 2.1000000000000002e-06, + "loss": 0.0495, + "step": 42 + }, + { + "epoch": 0.6972972972972973, + "grad_norm": 0.8804877996444702, + "learning_rate": 2.15e-06, + "loss": 0.0506, + "step": 43 + }, + { + "epoch": 0.7135135135135136, + "grad_norm": 0.7133358120918274, + "learning_rate": 2.2e-06, + "loss": 0.0467, + "step": 44 + }, + { + "epoch": 0.7297297297297297, + "grad_norm": 0.8142214417457581, + "learning_rate": 2.25e-06, + "loss": 0.0552, + "step": 45 + }, + { + "epoch": 0.745945945945946, + "grad_norm": 0.8341564536094666, + "learning_rate": 2.3000000000000004e-06, + "loss": 0.0574, + "step": 46 + }, + { + "epoch": 0.7621621621621621, + "grad_norm": 0.6500507593154907, + "learning_rate": 2.35e-06, + "loss": 0.0398, + "step": 47 + }, + { + "epoch": 0.7783783783783784, + "grad_norm": 0.6163598895072937, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.0459, + "step": 48 + }, + { + "epoch": 0.7945945945945946, + "grad_norm": 0.663949191570282, + "learning_rate": 2.4500000000000003e-06, + "loss": 0.046, + "step": 49 + }, + { + "epoch": 0.8108108108108109, + "grad_norm": 0.7521553635597229, + "learning_rate": 2.5e-06, + "loss": 0.0525, + "step": 50 + }, + { + "epoch": 0.827027027027027, + "grad_norm": 0.7828383445739746, + "learning_rate": 
2.55e-06, + "loss": 0.0558, + "step": 51 + }, + { + "epoch": 0.8432432432432433, + "grad_norm": 0.7935078740119934, + "learning_rate": 2.6e-06, + "loss": 0.0451, + "step": 52 + }, + { + "epoch": 0.8594594594594595, + "grad_norm": 0.6327880620956421, + "learning_rate": 2.6500000000000005e-06, + "loss": 0.0403, + "step": 53 + }, + { + "epoch": 0.8756756756756757, + "grad_norm": 0.6185981035232544, + "learning_rate": 2.7000000000000004e-06, + "loss": 0.0406, + "step": 54 + }, + { + "epoch": 0.8918918918918919, + "grad_norm": 0.5417979955673218, + "learning_rate": 2.7500000000000004e-06, + "loss": 0.0426, + "step": 55 + }, + { + "epoch": 0.9081081081081082, + "grad_norm": 0.7140630483627319, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.0446, + "step": 56 + }, + { + "epoch": 0.9243243243243243, + "grad_norm": 0.7191944122314453, + "learning_rate": 2.85e-06, + "loss": 0.047, + "step": 57 + }, + { + "epoch": 0.9405405405405406, + "grad_norm": 0.7562940716743469, + "learning_rate": 2.9e-06, + "loss": 0.0476, + "step": 58 + }, + { + "epoch": 0.9567567567567568, + "grad_norm": 0.7422239184379578, + "learning_rate": 2.95e-06, + "loss": 0.0462, + "step": 59 + }, + { + "epoch": 0.972972972972973, + "grad_norm": 0.677144467830658, + "learning_rate": 3e-06, + "loss": 0.0475, + "step": 60 + }, + { + "epoch": 0.9891891891891892, + "grad_norm": 0.6127192974090576, + "learning_rate": 3.05e-06, + "loss": 0.0434, + "step": 61 + }, + { + "epoch": 1.0, + "grad_norm": 0.6127192974090576, + "learning_rate": 3.1000000000000004e-06, + "loss": 0.0375, + "step": 62 + }, + { + "epoch": 1.0162162162162163, + "grad_norm": 0.959559440612793, + "learning_rate": 3.1500000000000003e-06, + "loss": 0.0421, + "step": 63 + }, + { + "epoch": 1.0324324324324325, + "grad_norm": 0.6539880037307739, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.0414, + "step": 64 + }, + { + "epoch": 1.0486486486486486, + "grad_norm": 0.5929313898086548, + "learning_rate": 3.2500000000000002e-06, + "loss": 
0.0451, + "step": 65 + }, + { + "epoch": 1.0648648648648649, + "grad_norm": 0.6479571461677551, + "learning_rate": 3.3000000000000006e-06, + "loss": 0.0415, + "step": 66 + }, + { + "epoch": 1.0810810810810811, + "grad_norm": 0.5496926307678223, + "learning_rate": 3.3500000000000005e-06, + "loss": 0.0366, + "step": 67 + }, + { + "epoch": 1.0972972972972972, + "grad_norm": 0.5373682379722595, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.0383, + "step": 68 + }, + { + "epoch": 1.1135135135135135, + "grad_norm": 0.5489712357521057, + "learning_rate": 3.45e-06, + "loss": 0.0427, + "step": 69 + }, + { + "epoch": 1.1297297297297297, + "grad_norm": 0.6830047369003296, + "learning_rate": 3.5e-06, + "loss": 0.039, + "step": 70 + }, + { + "epoch": 1.145945945945946, + "grad_norm": 0.5794199705123901, + "learning_rate": 3.5500000000000003e-06, + "loss": 0.0409, + "step": 71 + }, + { + "epoch": 1.1621621621621623, + "grad_norm": 0.571513831615448, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.0392, + "step": 72 + }, + { + "epoch": 1.1783783783783783, + "grad_norm": 0.7753933668136597, + "learning_rate": 3.65e-06, + "loss": 0.0365, + "step": 73 + }, + { + "epoch": 1.1945945945945946, + "grad_norm": 0.6135310530662537, + "learning_rate": 3.7e-06, + "loss": 0.036, + "step": 74 + }, + { + "epoch": 1.2108108108108109, + "grad_norm": 0.5497344136238098, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.035, + "step": 75 + }, + { + "epoch": 1.227027027027027, + "grad_norm": 0.5861782431602478, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.0434, + "step": 76 + }, + { + "epoch": 1.2432432432432432, + "grad_norm": 0.6941010355949402, + "learning_rate": 3.85e-06, + "loss": 0.0336, + "step": 77 + }, + { + "epoch": 1.2594594594594595, + "grad_norm": 0.5305830240249634, + "learning_rate": 3.900000000000001e-06, + "loss": 0.0391, + "step": 78 + }, + { + "epoch": 1.2756756756756757, + "grad_norm": 0.6456385254859924, + "learning_rate": 3.95e-06, + "loss": 
0.0422, + "step": 79 + }, + { + "epoch": 1.291891891891892, + "grad_norm": 0.5704363584518433, + "learning_rate": 4.000000000000001e-06, + "loss": 0.0342, + "step": 80 + }, + { + "epoch": 1.308108108108108, + "grad_norm": 0.5257390141487122, + "learning_rate": 4.05e-06, + "loss": 0.0369, + "step": 81 + }, + { + "epoch": 1.3243243243243243, + "grad_norm": 0.5541989207267761, + "learning_rate": 4.1e-06, + "loss": 0.0331, + "step": 82 + }, + { + "epoch": 1.3405405405405406, + "grad_norm": 0.7190688252449036, + "learning_rate": 4.15e-06, + "loss": 0.039, + "step": 83 + }, + { + "epoch": 1.3567567567567567, + "grad_norm": 0.4766721725463867, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.0354, + "step": 84 + }, + { + "epoch": 1.372972972972973, + "grad_norm": 0.5847981572151184, + "learning_rate": 4.25e-06, + "loss": 0.0355, + "step": 85 + }, + { + "epoch": 1.3891891891891892, + "grad_norm": 0.6361181139945984, + "learning_rate": 4.3e-06, + "loss": 0.0415, + "step": 86 + }, + { + "epoch": 1.4054054054054055, + "grad_norm": 0.6437036395072937, + "learning_rate": 4.350000000000001e-06, + "loss": 0.0353, + "step": 87 + }, + { + "epoch": 1.4216216216216218, + "grad_norm": 0.712043046951294, + "learning_rate": 4.4e-06, + "loss": 0.0311, + "step": 88 + }, + { + "epoch": 1.4378378378378378, + "grad_norm": 0.5829771757125854, + "learning_rate": 4.450000000000001e-06, + "loss": 0.0433, + "step": 89 + }, + { + "epoch": 1.454054054054054, + "grad_norm": 0.6977937817573547, + "learning_rate": 4.5e-06, + "loss": 0.0391, + "step": 90 + }, + { + "epoch": 1.4702702702702704, + "grad_norm": 0.49931228160858154, + "learning_rate": 4.5500000000000005e-06, + "loss": 0.0352, + "step": 91 + }, + { + "epoch": 1.4864864864864864, + "grad_norm": 0.5281490683555603, + "learning_rate": 4.600000000000001e-06, + "loss": 0.0385, + "step": 92 + }, + { + "epoch": 1.5027027027027027, + "grad_norm": 0.613349974155426, + "learning_rate": 4.65e-06, + "loss": 0.0399, + "step": 93 + }, + { + 
"epoch": 1.518918918918919, + "grad_norm": 0.6584879755973816, + "learning_rate": 4.7e-06, + "loss": 0.043, + "step": 94 + }, + { + "epoch": 1.535135135135135, + "grad_norm": 0.6006895303726196, + "learning_rate": 4.75e-06, + "loss": 0.0372, + "step": 95 + }, + { + "epoch": 1.5513513513513515, + "grad_norm": 0.5364943146705627, + "learning_rate": 4.800000000000001e-06, + "loss": 0.0384, + "step": 96 + }, + { + "epoch": 1.5675675675675675, + "grad_norm": 0.4963968098163605, + "learning_rate": 4.85e-06, + "loss": 0.0324, + "step": 97 + }, + { + "epoch": 1.5837837837837838, + "grad_norm": 0.5868538618087769, + "learning_rate": 4.9000000000000005e-06, + "loss": 0.0386, + "step": 98 + }, + { + "epoch": 1.6, + "grad_norm": 0.6690974235534668, + "learning_rate": 4.95e-06, + "loss": 0.0332, + "step": 99 + }, + { + "epoch": 1.6162162162162161, + "grad_norm": 0.6118388175964355, + "learning_rate": 5e-06, + "loss": 0.0398, + "step": 100 + }, + { + "epoch": 1.6324324324324324, + "grad_norm": 0.6872304677963257, + "learning_rate": 4.999825642177387e-06, + "loss": 0.0333, + "step": 101 + }, + { + "epoch": 1.6486486486486487, + "grad_norm": 0.6457200646400452, + "learning_rate": 4.999302593030069e-06, + "loss": 0.0381, + "step": 102 + }, + { + "epoch": 1.6648648648648647, + "grad_norm": 0.6096416115760803, + "learning_rate": 4.998430925516213e-06, + "loss": 0.0385, + "step": 103 + }, + { + "epoch": 1.6810810810810812, + "grad_norm": 0.582796573638916, + "learning_rate": 4.99721076122146e-06, + "loss": 0.0317, + "step": 104 + }, + { + "epoch": 1.6972972972972973, + "grad_norm": 0.5576394200325012, + "learning_rate": 4.995642270341961e-06, + "loss": 0.0378, + "step": 105 + }, + { + "epoch": 1.7135135135135136, + "grad_norm": 0.7414760589599609, + "learning_rate": 4.99372567166064e-06, + "loss": 0.0403, + "step": 106 + }, + { + "epoch": 1.7297297297297298, + "grad_norm": 0.6029103994369507, + "learning_rate": 4.991461232516675e-06, + "loss": 0.0418, + "step": 107 + }, + { + "epoch": 
1.7459459459459459, + "grad_norm": 0.771609365940094, + "learning_rate": 4.98884926876821e-06, + "loss": 0.0413, + "step": 108 + }, + { + "epoch": 1.7621621621621621, + "grad_norm": 0.6869891285896301, + "learning_rate": 4.9858901447482924e-06, + "loss": 0.0367, + "step": 109 + }, + { + "epoch": 1.7783783783783784, + "grad_norm": 0.4931647479534149, + "learning_rate": 4.982584273214061e-06, + "loss": 0.033, + "step": 110 + }, + { + "epoch": 1.7945945945945945, + "grad_norm": 0.5160052180290222, + "learning_rate": 4.978932115289165e-06, + "loss": 0.0357, + "step": 111 + }, + { + "epoch": 1.810810810810811, + "grad_norm": 0.49750861525535583, + "learning_rate": 4.974934180399447e-06, + "loss": 0.0333, + "step": 112 + }, + { + "epoch": 1.827027027027027, + "grad_norm": 0.6596441864967346, + "learning_rate": 4.970591026201884e-06, + "loss": 0.0354, + "step": 113 + }, + { + "epoch": 1.8432432432432433, + "grad_norm": 0.6613579988479614, + "learning_rate": 4.965903258506806e-06, + "loss": 0.0377, + "step": 114 + }, + { + "epoch": 1.8594594594594596, + "grad_norm": 0.5383866429328918, + "learning_rate": 4.9608715311933865e-06, + "loss": 0.0418, + "step": 115 + }, + { + "epoch": 1.8756756756756756, + "grad_norm": 0.6303413510322571, + "learning_rate": 4.955496546118439e-06, + "loss": 0.0351, + "step": 116 + }, + { + "epoch": 1.8918918918918919, + "grad_norm": 0.5293605923652649, + "learning_rate": 4.949779053018519e-06, + "loss": 0.0322, + "step": 117 + }, + { + "epoch": 1.9081081081081082, + "grad_norm": 0.5211143493652344, + "learning_rate": 4.943719849405347e-06, + "loss": 0.0374, + "step": 118 + }, + { + "epoch": 1.9243243243243242, + "grad_norm": 0.5933778882026672, + "learning_rate": 4.937319780454559e-06, + "loss": 0.0377, + "step": 119 + }, + { + "epoch": 1.9405405405405407, + "grad_norm": 0.6020687818527222, + "learning_rate": 4.930579738887827e-06, + "loss": 0.0313, + "step": 120 + }, + { + "epoch": 1.9567567567567568, + "grad_norm": 0.7828154563903809, + 
"learning_rate": 4.923500664848327e-06, + "loss": 0.0372, + "step": 121 + }, + { + "epoch": 1.972972972972973, + "grad_norm": 0.6172424554824829, + "learning_rate": 4.9160835457696075e-06, + "loss": 0.0387, + "step": 122 + } + ], + "logging_steps": 1, + "max_steps": 366, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 61, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.069134586563789e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-122/training_args.bin b/checkpoint-122/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..5b28d107f55169977eced33ac6929abb398bb2c5 --- /dev/null +++ b/checkpoint-122/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e2f1aaf0f48ae52048eea3703205522237e597bd418f53d57d152ef3ad9cbbc +size 8056 diff --git a/checkpoint-122/zero_to_fp32.py b/checkpoint-122/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/checkpoint-122/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . 
pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise 
FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # 
immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, 
zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + 
full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + 
wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # 
recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. 
+ + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. 
(one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/checkpoint-183/README.md b/checkpoint-183/README.md new file mode 100644 index 0000000000000000000000000000000000000000..be5c87703f12b547886cc6a2ecfbe9ee150496fa --- /dev/null +++ b/checkpoint-183/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Llama-3.1-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/checkpoint-183/adapter_config.json b/checkpoint-183/adapter_config.json new file mode 100644 index 
0000000000000000000000000000000000000000..aaa71b6240dcb4147fb982eb2f0ff89574c4fb31 --- /dev/null +++ b/checkpoint-183/adapter_config.json @@ -0,0 +1,40 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.1-8B-Instruct", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 512, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [ + "embed_tokens", + "lm_head" + ], + "peft_type": "LORA", + "r": 256, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "k_proj", + "up_proj", + "gate_proj", + "v_proj", + "down_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-183/adapter_model.safetensors b/checkpoint-183/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5ea72cf9404b552b4e64bfef210c06527862d614 --- /dev/null +++ b/checkpoint-183/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26bc37551fb9d39ad6b39c832ebb8cf258aec2ce19960bdd07474f419a549d90 +size 3443586272 diff --git a/checkpoint-183/global_step182/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-183/global_step182/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bfe9c3d7b66b3eaf153a1ec678cf0fef66894cf6 --- /dev/null +++ b/checkpoint-183/global_step182/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6fb92c964e1654c04959f7efd021a2ceeda52719ef273904824c4dd19bbc0f0 +size 20661195036 diff --git 
a/checkpoint-183/global_step182/mp_rank_00_model_states.pt b/checkpoint-183/global_step182/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b890641df7b2b37f11a815c2f01361fe8ecaccf7 --- /dev/null +++ b/checkpoint-183/global_step182/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d09aa6d3db817dfaaabeb6c913598c07c3832e1a33f59df54c5fea046f2f1432 +size 3555326649 diff --git a/checkpoint-183/latest b/checkpoint-183/latest new file mode 100644 index 0000000000000000000000000000000000000000..2e27f8382bb9c13c17eaff9fb29dfe56e5456858 --- /dev/null +++ b/checkpoint-183/latest @@ -0,0 +1 @@ +global_step182 \ No newline at end of file diff --git a/checkpoint-183/rng_state.pth b/checkpoint-183/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..efeb8de2f7e6c9b1bc05398cc8da83b3eab8a94f --- /dev/null +++ b/checkpoint-183/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49a17ac1996be10c867f8f6e6b90aa274f74ec7f58fd32b42705450d9ba5c16e +size 14244 diff --git a/checkpoint-183/scheduler.pt b/checkpoint-183/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..80bed569b9a589c3baf9ca0e6f456f2d31ccdaba --- /dev/null +++ b/checkpoint-183/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:638be0ea97d5ae33636c6687142a00e0c87ea82a700820d19a9873974009b8e8 +size 1064 diff --git a/checkpoint-183/special_tokens_map.json b/checkpoint-183/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..278b7f0f84be865c4687700ee7b3c63d89a51e18 --- /dev/null +++ b/checkpoint-183/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-183/tokenizer.json b/checkpoint-183/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-183/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-183/tokenizer_config.json b/checkpoint-183/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ca91a2ef55f4239a7af81d7c9abb05f53621a07b --- /dev/null +++ b/checkpoint-183/tokenizer_config.json @@ -0,0 +1,2064 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": 
"<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": 
"<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": 
false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": 
false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": 
"<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": 
"<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + 
"extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|end_of_text|>", + "tokenizer_class": "PreTrainedTokenizer" +} diff --git a/checkpoint-183/trainer_state.json b/checkpoint-183/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..1ea3bd8df5c9fd2732493894ba09353625b50b38 --- /dev/null +++ b/checkpoint-183/trainer_state.json @@ -0,0 +1,1314 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.9567567567567568, + "eval_steps": 500, + "global_step": 183, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.016216216216216217, + "grad_norm": 39.12052917480469, + "learning_rate": 5.0000000000000004e-08, + "loss": 2.2957, + "step": 1 + }, + { + "epoch": 0.032432432432432434, + "grad_norm": 38.9581413269043, + "learning_rate": 1.0000000000000001e-07, + "loss": 2.2959, + "step": 2 + }, + { + "epoch": 0.04864864864864865, + "grad_norm": 39.2702751159668, + "learning_rate": 1.5000000000000002e-07, + "loss": 2.2677, + "step": 3 + }, + { + "epoch": 0.06486486486486487, + "grad_norm": 39.18815231323242, + "learning_rate": 2.0000000000000002e-07, + "loss": 2.2936, + "step": 4 + }, + { + "epoch": 0.08108108108108109, + "grad_norm": 38.66701889038086, + "learning_rate": 2.5000000000000004e-07, + "loss": 2.2561, + "step": 5 + }, + { + "epoch": 0.0972972972972973, + "grad_norm": 39.53536605834961, + "learning_rate": 3.0000000000000004e-07, + "loss": 2.2579, + "step": 6 + }, + { + "epoch": 0.11351351351351352, + "grad_norm": 39.3793830871582, + "learning_rate": 3.5000000000000004e-07, + "loss": 2.2627, + "step": 7 + }, + { + "epoch": 0.12972972972972974, + "grad_norm": 39.88922119140625, + "learning_rate": 4.0000000000000003e-07, + "loss": 2.2729, + "step": 8 + }, + { + "epoch": 0.14594594594594595, + "grad_norm": 37.9880256652832, + "learning_rate": 
4.5000000000000003e-07, + "loss": 2.2311, + "step": 9 + }, + { + "epoch": 0.16216216216216217, + "grad_norm": 37.024139404296875, + "learning_rate": 5.000000000000001e-07, + "loss": 2.1773, + "step": 10 + }, + { + "epoch": 0.1783783783783784, + "grad_norm": 36.89325714111328, + "learning_rate": 5.5e-07, + "loss": 2.1927, + "step": 11 + }, + { + "epoch": 0.1945945945945946, + "grad_norm": 37.244178771972656, + "learning_rate": 6.000000000000001e-07, + "loss": 2.1757, + "step": 12 + }, + { + "epoch": 0.21081081081081082, + "grad_norm": 34.77650451660156, + "learning_rate": 6.5e-07, + "loss": 2.0392, + "step": 13 + }, + { + "epoch": 0.22702702702702704, + "grad_norm": 34.78818893432617, + "learning_rate": 7.000000000000001e-07, + "loss": 1.9996, + "step": 14 + }, + { + "epoch": 0.24324324324324326, + "grad_norm": 34.86852264404297, + "learning_rate": 7.5e-07, + "loss": 1.9496, + "step": 15 + }, + { + "epoch": 0.2594594594594595, + "grad_norm": 35.202796936035156, + "learning_rate": 8.000000000000001e-07, + "loss": 1.8542, + "step": 16 + }, + { + "epoch": 0.2756756756756757, + "grad_norm": 34.11354064941406, + "learning_rate": 8.500000000000001e-07, + "loss": 1.7118, + "step": 17 + }, + { + "epoch": 0.2918918918918919, + "grad_norm": 36.309059143066406, + "learning_rate": 9.000000000000001e-07, + "loss": 1.6834, + "step": 18 + }, + { + "epoch": 0.3081081081081081, + "grad_norm": 34.69994354248047, + "learning_rate": 9.500000000000001e-07, + "loss": 1.5298, + "step": 19 + }, + { + "epoch": 0.32432432432432434, + "grad_norm": 35.43153381347656, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.4191, + "step": 20 + }, + { + "epoch": 0.34054054054054056, + "grad_norm": 33.53745651245117, + "learning_rate": 1.0500000000000001e-06, + "loss": 1.3068, + "step": 21 + }, + { + "epoch": 0.3567567567567568, + "grad_norm": 33.775604248046875, + "learning_rate": 1.1e-06, + "loss": 1.224, + "step": 22 + }, + { + "epoch": 0.372972972972973, + "grad_norm": 30.57005500793457, + 
"learning_rate": 1.1500000000000002e-06, + "loss": 1.0704, + "step": 23 + }, + { + "epoch": 0.3891891891891892, + "grad_norm": 27.964860916137695, + "learning_rate": 1.2000000000000002e-06, + "loss": 0.9548, + "step": 24 + }, + { + "epoch": 0.40540540540540543, + "grad_norm": 26.023576736450195, + "learning_rate": 1.25e-06, + "loss": 0.8503, + "step": 25 + }, + { + "epoch": 0.42162162162162165, + "grad_norm": 25.0452938079834, + "learning_rate": 1.3e-06, + "loss": 0.6938, + "step": 26 + }, + { + "epoch": 0.43783783783783786, + "grad_norm": 24.663373947143555, + "learning_rate": 1.3500000000000002e-06, + "loss": 0.5648, + "step": 27 + }, + { + "epoch": 0.4540540540540541, + "grad_norm": 21.61736488342285, + "learning_rate": 1.4000000000000001e-06, + "loss": 0.435, + "step": 28 + }, + { + "epoch": 0.4702702702702703, + "grad_norm": 18.3259334564209, + "learning_rate": 1.45e-06, + "loss": 0.3322, + "step": 29 + }, + { + "epoch": 0.4864864864864865, + "grad_norm": 16.80081558227539, + "learning_rate": 1.5e-06, + "loss": 0.2625, + "step": 30 + }, + { + "epoch": 0.5027027027027027, + "grad_norm": 14.789258003234863, + "learning_rate": 1.5500000000000002e-06, + "loss": 0.1757, + "step": 31 + }, + { + "epoch": 0.518918918918919, + "grad_norm": 10.406538963317871, + "learning_rate": 1.6000000000000001e-06, + "loss": 0.1376, + "step": 32 + }, + { + "epoch": 0.5351351351351351, + "grad_norm": 4.868802547454834, + "learning_rate": 1.6500000000000003e-06, + "loss": 0.0815, + "step": 33 + }, + { + "epoch": 0.5513513513513514, + "grad_norm": 1.8639686107635498, + "learning_rate": 1.7000000000000002e-06, + "loss": 0.0628, + "step": 34 + }, + { + "epoch": 0.5675675675675675, + "grad_norm": 1.897918462753296, + "learning_rate": 1.75e-06, + "loss": 0.0775, + "step": 35 + }, + { + "epoch": 0.5837837837837838, + "grad_norm": 1.296712040901184, + "learning_rate": 1.8000000000000001e-06, + "loss": 0.0565, + "step": 36 + }, + { + "epoch": 0.6, + "grad_norm": 1.0163214206695557, + 
"learning_rate": 1.85e-06, + "loss": 0.0544, + "step": 37 + }, + { + "epoch": 0.6162162162162163, + "grad_norm": 1.070162296295166, + "learning_rate": 1.9000000000000002e-06, + "loss": 0.0621, + "step": 38 + }, + { + "epoch": 0.6324324324324324, + "grad_norm": 1.024267315864563, + "learning_rate": 1.9500000000000004e-06, + "loss": 0.0566, + "step": 39 + }, + { + "epoch": 0.6486486486486487, + "grad_norm": 0.9016611576080322, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.0511, + "step": 40 + }, + { + "epoch": 0.6648648648648648, + "grad_norm": 0.8272562623023987, + "learning_rate": 2.05e-06, + "loss": 0.0533, + "step": 41 + }, + { + "epoch": 0.6810810810810811, + "grad_norm": 0.8875278234481812, + "learning_rate": 2.1000000000000002e-06, + "loss": 0.0495, + "step": 42 + }, + { + "epoch": 0.6972972972972973, + "grad_norm": 0.8804877996444702, + "learning_rate": 2.15e-06, + "loss": 0.0506, + "step": 43 + }, + { + "epoch": 0.7135135135135136, + "grad_norm": 0.7133358120918274, + "learning_rate": 2.2e-06, + "loss": 0.0467, + "step": 44 + }, + { + "epoch": 0.7297297297297297, + "grad_norm": 0.8142214417457581, + "learning_rate": 2.25e-06, + "loss": 0.0552, + "step": 45 + }, + { + "epoch": 0.745945945945946, + "grad_norm": 0.8341564536094666, + "learning_rate": 2.3000000000000004e-06, + "loss": 0.0574, + "step": 46 + }, + { + "epoch": 0.7621621621621621, + "grad_norm": 0.6500507593154907, + "learning_rate": 2.35e-06, + "loss": 0.0398, + "step": 47 + }, + { + "epoch": 0.7783783783783784, + "grad_norm": 0.6163598895072937, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.0459, + "step": 48 + }, + { + "epoch": 0.7945945945945946, + "grad_norm": 0.663949191570282, + "learning_rate": 2.4500000000000003e-06, + "loss": 0.046, + "step": 49 + }, + { + "epoch": 0.8108108108108109, + "grad_norm": 0.7521553635597229, + "learning_rate": 2.5e-06, + "loss": 0.0525, + "step": 50 + }, + { + "epoch": 0.827027027027027, + "grad_norm": 0.7828383445739746, + "learning_rate": 
2.55e-06, + "loss": 0.0558, + "step": 51 + }, + { + "epoch": 0.8432432432432433, + "grad_norm": 0.7935078740119934, + "learning_rate": 2.6e-06, + "loss": 0.0451, + "step": 52 + }, + { + "epoch": 0.8594594594594595, + "grad_norm": 0.6327880620956421, + "learning_rate": 2.6500000000000005e-06, + "loss": 0.0403, + "step": 53 + }, + { + "epoch": 0.8756756756756757, + "grad_norm": 0.6185981035232544, + "learning_rate": 2.7000000000000004e-06, + "loss": 0.0406, + "step": 54 + }, + { + "epoch": 0.8918918918918919, + "grad_norm": 0.5417979955673218, + "learning_rate": 2.7500000000000004e-06, + "loss": 0.0426, + "step": 55 + }, + { + "epoch": 0.9081081081081082, + "grad_norm": 0.7140630483627319, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.0446, + "step": 56 + }, + { + "epoch": 0.9243243243243243, + "grad_norm": 0.7191944122314453, + "learning_rate": 2.85e-06, + "loss": 0.047, + "step": 57 + }, + { + "epoch": 0.9405405405405406, + "grad_norm": 0.7562940716743469, + "learning_rate": 2.9e-06, + "loss": 0.0476, + "step": 58 + }, + { + "epoch": 0.9567567567567568, + "grad_norm": 0.7422239184379578, + "learning_rate": 2.95e-06, + "loss": 0.0462, + "step": 59 + }, + { + "epoch": 0.972972972972973, + "grad_norm": 0.677144467830658, + "learning_rate": 3e-06, + "loss": 0.0475, + "step": 60 + }, + { + "epoch": 0.9891891891891892, + "grad_norm": 0.6127192974090576, + "learning_rate": 3.05e-06, + "loss": 0.0434, + "step": 61 + }, + { + "epoch": 1.0, + "grad_norm": 0.6127192974090576, + "learning_rate": 3.1000000000000004e-06, + "loss": 0.0375, + "step": 62 + }, + { + "epoch": 1.0162162162162163, + "grad_norm": 0.959559440612793, + "learning_rate": 3.1500000000000003e-06, + "loss": 0.0421, + "step": 63 + }, + { + "epoch": 1.0324324324324325, + "grad_norm": 0.6539880037307739, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.0414, + "step": 64 + }, + { + "epoch": 1.0486486486486486, + "grad_norm": 0.5929313898086548, + "learning_rate": 3.2500000000000002e-06, + "loss": 
0.0451, + "step": 65 + }, + { + "epoch": 1.0648648648648649, + "grad_norm": 0.6479571461677551, + "learning_rate": 3.3000000000000006e-06, + "loss": 0.0415, + "step": 66 + }, + { + "epoch": 1.0810810810810811, + "grad_norm": 0.5496926307678223, + "learning_rate": 3.3500000000000005e-06, + "loss": 0.0366, + "step": 67 + }, + { + "epoch": 1.0972972972972972, + "grad_norm": 0.5373682379722595, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.0383, + "step": 68 + }, + { + "epoch": 1.1135135135135135, + "grad_norm": 0.5489712357521057, + "learning_rate": 3.45e-06, + "loss": 0.0427, + "step": 69 + }, + { + "epoch": 1.1297297297297297, + "grad_norm": 0.6830047369003296, + "learning_rate": 3.5e-06, + "loss": 0.039, + "step": 70 + }, + { + "epoch": 1.145945945945946, + "grad_norm": 0.5794199705123901, + "learning_rate": 3.5500000000000003e-06, + "loss": 0.0409, + "step": 71 + }, + { + "epoch": 1.1621621621621623, + "grad_norm": 0.571513831615448, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.0392, + "step": 72 + }, + { + "epoch": 1.1783783783783783, + "grad_norm": 0.7753933668136597, + "learning_rate": 3.65e-06, + "loss": 0.0365, + "step": 73 + }, + { + "epoch": 1.1945945945945946, + "grad_norm": 0.6135310530662537, + "learning_rate": 3.7e-06, + "loss": 0.036, + "step": 74 + }, + { + "epoch": 1.2108108108108109, + "grad_norm": 0.5497344136238098, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.035, + "step": 75 + }, + { + "epoch": 1.227027027027027, + "grad_norm": 0.5861782431602478, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.0434, + "step": 76 + }, + { + "epoch": 1.2432432432432432, + "grad_norm": 0.6941010355949402, + "learning_rate": 3.85e-06, + "loss": 0.0336, + "step": 77 + }, + { + "epoch": 1.2594594594594595, + "grad_norm": 0.5305830240249634, + "learning_rate": 3.900000000000001e-06, + "loss": 0.0391, + "step": 78 + }, + { + "epoch": 1.2756756756756757, + "grad_norm": 0.6456385254859924, + "learning_rate": 3.95e-06, + "loss": 
0.0422, + "step": 79 + }, + { + "epoch": 1.291891891891892, + "grad_norm": 0.5704363584518433, + "learning_rate": 4.000000000000001e-06, + "loss": 0.0342, + "step": 80 + }, + { + "epoch": 1.308108108108108, + "grad_norm": 0.5257390141487122, + "learning_rate": 4.05e-06, + "loss": 0.0369, + "step": 81 + }, + { + "epoch": 1.3243243243243243, + "grad_norm": 0.5541989207267761, + "learning_rate": 4.1e-06, + "loss": 0.0331, + "step": 82 + }, + { + "epoch": 1.3405405405405406, + "grad_norm": 0.7190688252449036, + "learning_rate": 4.15e-06, + "loss": 0.039, + "step": 83 + }, + { + "epoch": 1.3567567567567567, + "grad_norm": 0.4766721725463867, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.0354, + "step": 84 + }, + { + "epoch": 1.372972972972973, + "grad_norm": 0.5847981572151184, + "learning_rate": 4.25e-06, + "loss": 0.0355, + "step": 85 + }, + { + "epoch": 1.3891891891891892, + "grad_norm": 0.6361181139945984, + "learning_rate": 4.3e-06, + "loss": 0.0415, + "step": 86 + }, + { + "epoch": 1.4054054054054055, + "grad_norm": 0.6437036395072937, + "learning_rate": 4.350000000000001e-06, + "loss": 0.0353, + "step": 87 + }, + { + "epoch": 1.4216216216216218, + "grad_norm": 0.712043046951294, + "learning_rate": 4.4e-06, + "loss": 0.0311, + "step": 88 + }, + { + "epoch": 1.4378378378378378, + "grad_norm": 0.5829771757125854, + "learning_rate": 4.450000000000001e-06, + "loss": 0.0433, + "step": 89 + }, + { + "epoch": 1.454054054054054, + "grad_norm": 0.6977937817573547, + "learning_rate": 4.5e-06, + "loss": 0.0391, + "step": 90 + }, + { + "epoch": 1.4702702702702704, + "grad_norm": 0.49931228160858154, + "learning_rate": 4.5500000000000005e-06, + "loss": 0.0352, + "step": 91 + }, + { + "epoch": 1.4864864864864864, + "grad_norm": 0.5281490683555603, + "learning_rate": 4.600000000000001e-06, + "loss": 0.0385, + "step": 92 + }, + { + "epoch": 1.5027027027027027, + "grad_norm": 0.613349974155426, + "learning_rate": 4.65e-06, + "loss": 0.0399, + "step": 93 + }, + { + 
"epoch": 1.518918918918919, + "grad_norm": 0.6584879755973816, + "learning_rate": 4.7e-06, + "loss": 0.043, + "step": 94 + }, + { + "epoch": 1.535135135135135, + "grad_norm": 0.6006895303726196, + "learning_rate": 4.75e-06, + "loss": 0.0372, + "step": 95 + }, + { + "epoch": 1.5513513513513515, + "grad_norm": 0.5364943146705627, + "learning_rate": 4.800000000000001e-06, + "loss": 0.0384, + "step": 96 + }, + { + "epoch": 1.5675675675675675, + "grad_norm": 0.4963968098163605, + "learning_rate": 4.85e-06, + "loss": 0.0324, + "step": 97 + }, + { + "epoch": 1.5837837837837838, + "grad_norm": 0.5868538618087769, + "learning_rate": 4.9000000000000005e-06, + "loss": 0.0386, + "step": 98 + }, + { + "epoch": 1.6, + "grad_norm": 0.6690974235534668, + "learning_rate": 4.95e-06, + "loss": 0.0332, + "step": 99 + }, + { + "epoch": 1.6162162162162161, + "grad_norm": 0.6118388175964355, + "learning_rate": 5e-06, + "loss": 0.0398, + "step": 100 + }, + { + "epoch": 1.6324324324324324, + "grad_norm": 0.6872304677963257, + "learning_rate": 4.999825642177387e-06, + "loss": 0.0333, + "step": 101 + }, + { + "epoch": 1.6486486486486487, + "grad_norm": 0.6457200646400452, + "learning_rate": 4.999302593030069e-06, + "loss": 0.0381, + "step": 102 + }, + { + "epoch": 1.6648648648648647, + "grad_norm": 0.6096416115760803, + "learning_rate": 4.998430925516213e-06, + "loss": 0.0385, + "step": 103 + }, + { + "epoch": 1.6810810810810812, + "grad_norm": 0.582796573638916, + "learning_rate": 4.99721076122146e-06, + "loss": 0.0317, + "step": 104 + }, + { + "epoch": 1.6972972972972973, + "grad_norm": 0.5576394200325012, + "learning_rate": 4.995642270341961e-06, + "loss": 0.0378, + "step": 105 + }, + { + "epoch": 1.7135135135135136, + "grad_norm": 0.7414760589599609, + "learning_rate": 4.99372567166064e-06, + "loss": 0.0403, + "step": 106 + }, + { + "epoch": 1.7297297297297298, + "grad_norm": 0.6029103994369507, + "learning_rate": 4.991461232516675e-06, + "loss": 0.0418, + "step": 107 + }, + { + "epoch": 
1.7459459459459459, + "grad_norm": 0.771609365940094, + "learning_rate": 4.98884926876821e-06, + "loss": 0.0413, + "step": 108 + }, + { + "epoch": 1.7621621621621621, + "grad_norm": 0.6869891285896301, + "learning_rate": 4.9858901447482924e-06, + "loss": 0.0367, + "step": 109 + }, + { + "epoch": 1.7783783783783784, + "grad_norm": 0.4931647479534149, + "learning_rate": 4.982584273214061e-06, + "loss": 0.033, + "step": 110 + }, + { + "epoch": 1.7945945945945945, + "grad_norm": 0.5160052180290222, + "learning_rate": 4.978932115289165e-06, + "loss": 0.0357, + "step": 111 + }, + { + "epoch": 1.810810810810811, + "grad_norm": 0.49750861525535583, + "learning_rate": 4.974934180399447e-06, + "loss": 0.0333, + "step": 112 + }, + { + "epoch": 1.827027027027027, + "grad_norm": 0.6596441864967346, + "learning_rate": 4.970591026201884e-06, + "loss": 0.0354, + "step": 113 + }, + { + "epoch": 1.8432432432432433, + "grad_norm": 0.6613579988479614, + "learning_rate": 4.965903258506806e-06, + "loss": 0.0377, + "step": 114 + }, + { + "epoch": 1.8594594594594596, + "grad_norm": 0.5383866429328918, + "learning_rate": 4.9608715311933865e-06, + "loss": 0.0418, + "step": 115 + }, + { + "epoch": 1.8756756756756756, + "grad_norm": 0.6303413510322571, + "learning_rate": 4.955496546118439e-06, + "loss": 0.0351, + "step": 116 + }, + { + "epoch": 1.8918918918918919, + "grad_norm": 0.5293605923652649, + "learning_rate": 4.949779053018519e-06, + "loss": 0.0322, + "step": 117 + }, + { + "epoch": 1.9081081081081082, + "grad_norm": 0.5211143493652344, + "learning_rate": 4.943719849405347e-06, + "loss": 0.0374, + "step": 118 + }, + { + "epoch": 1.9243243243243242, + "grad_norm": 0.5933778882026672, + "learning_rate": 4.937319780454559e-06, + "loss": 0.0377, + "step": 119 + }, + { + "epoch": 1.9405405405405407, + "grad_norm": 0.6020687818527222, + "learning_rate": 4.930579738887827e-06, + "loss": 0.0313, + "step": 120 + }, + { + "epoch": 1.9567567567567568, + "grad_norm": 0.7828154563903809, + 
"learning_rate": 4.923500664848327e-06, + "loss": 0.0372, + "step": 121 + }, + { + "epoch": 1.972972972972973, + "grad_norm": 0.6172424554824829, + "learning_rate": 4.9160835457696075e-06, + "loss": 0.0387, + "step": 122 + }, + { + "epoch": 1.9891891891891893, + "grad_norm": 0.5671921372413635, + "learning_rate": 4.9083294162378545e-06, + "loss": 0.0346, + "step": 123 + }, + { + "epoch": 2.0, + "grad_norm": 1.0704405307769775, + "learning_rate": 4.900239357847582e-06, + "loss": 0.0298, + "step": 124 + }, + { + "epoch": 2.016216216216216, + "grad_norm": 0.5932011604309082, + "learning_rate": 4.891814499050762e-06, + "loss": 0.0243, + "step": 125 + }, + { + "epoch": 2.0324324324324325, + "grad_norm": 0.47397834062576294, + "learning_rate": 4.883056014999423e-06, + "loss": 0.0281, + "step": 126 + }, + { + "epoch": 2.0486486486486486, + "grad_norm": 0.538270115852356, + "learning_rate": 4.873965127381734e-06, + "loss": 0.0268, + "step": 127 + }, + { + "epoch": 2.064864864864865, + "grad_norm": 0.3924686908721924, + "learning_rate": 4.864543104251587e-06, + "loss": 0.02, + "step": 128 + }, + { + "epoch": 2.081081081081081, + "grad_norm": 0.5162842869758606, + "learning_rate": 4.854791259851735e-06, + "loss": 0.0237, + "step": 129 + }, + { + "epoch": 2.097297297297297, + "grad_norm": 0.4691126048564911, + "learning_rate": 4.844710954430464e-06, + "loss": 0.0224, + "step": 130 + }, + { + "epoch": 2.1135135135135137, + "grad_norm": 0.47650063037872314, + "learning_rate": 4.834303594051854e-06, + "loss": 0.0202, + "step": 131 + }, + { + "epoch": 2.1297297297297297, + "grad_norm": 0.5041627883911133, + "learning_rate": 4.823570630399665e-06, + "loss": 0.0228, + "step": 132 + }, + { + "epoch": 2.145945945945946, + "grad_norm": 0.5368483662605286, + "learning_rate": 4.812513560574832e-06, + "loss": 0.0241, + "step": 133 + }, + { + "epoch": 2.1621621621621623, + "grad_norm": 0.7245975732803345, + "learning_rate": 4.8011339268866505e-06, + "loss": 0.0294, + "step": 134 + }, + { 
+ "epoch": 2.1783783783783783, + "grad_norm": 0.5506283044815063, + "learning_rate": 4.789433316637644e-06, + "loss": 0.0204, + "step": 135 + }, + { + "epoch": 2.1945945945945944, + "grad_norm": 0.5240617990493774, + "learning_rate": 4.777413361902152e-06, + "loss": 0.0227, + "step": 136 + }, + { + "epoch": 2.210810810810811, + "grad_norm": 0.6164438128471375, + "learning_rate": 4.765075739298683e-06, + "loss": 0.0209, + "step": 137 + }, + { + "epoch": 2.227027027027027, + "grad_norm": 0.551898181438446, + "learning_rate": 4.752422169756048e-06, + "loss": 0.0187, + "step": 138 + }, + { + "epoch": 2.2432432432432434, + "grad_norm": 0.45092299580574036, + "learning_rate": 4.739454418273314e-06, + "loss": 0.0281, + "step": 139 + }, + { + "epoch": 2.2594594594594595, + "grad_norm": 0.48173126578330994, + "learning_rate": 4.726174293673612e-06, + "loss": 0.0213, + "step": 140 + }, + { + "epoch": 2.2756756756756755, + "grad_norm": 0.48536229133605957, + "learning_rate": 4.712583648351827e-06, + "loss": 0.0204, + "step": 141 + }, + { + "epoch": 2.291891891891892, + "grad_norm": 0.4885499179363251, + "learning_rate": 4.698684378016223e-06, + "loss": 0.0225, + "step": 142 + }, + { + "epoch": 2.308108108108108, + "grad_norm": 0.40719687938690186, + "learning_rate": 4.684478421424007e-06, + "loss": 0.0205, + "step": 143 + }, + { + "epoch": 2.3243243243243246, + "grad_norm": 0.4365272521972656, + "learning_rate": 4.669967760110908e-06, + "loss": 0.0224, + "step": 144 + }, + { + "epoch": 2.3405405405405406, + "grad_norm": 0.4639301300048828, + "learning_rate": 4.655154418114774e-06, + "loss": 0.0256, + "step": 145 + }, + { + "epoch": 2.3567567567567567, + "grad_norm": 0.47420835494995117, + "learning_rate": 4.6400404616932505e-06, + "loss": 0.0208, + "step": 146 + }, + { + "epoch": 2.372972972972973, + "grad_norm": 0.5030474066734314, + "learning_rate": 4.624627999035564e-06, + "loss": 0.0255, + "step": 147 + }, + { + "epoch": 2.389189189189189, + "grad_norm": 
0.47888803482055664, + "learning_rate": 4.608919179968457e-06, + "loss": 0.0241, + "step": 148 + }, + { + "epoch": 2.4054054054054053, + "grad_norm": 0.602581262588501, + "learning_rate": 4.592916195656322e-06, + "loss": 0.0243, + "step": 149 + }, + { + "epoch": 2.4216216216216218, + "grad_norm": 0.6816417574882507, + "learning_rate": 4.576621278295558e-06, + "loss": 0.0259, + "step": 150 + }, + { + "epoch": 2.437837837837838, + "grad_norm": 0.6839447617530823, + "learning_rate": 4.5600367008032135e-06, + "loss": 0.0247, + "step": 151 + }, + { + "epoch": 2.454054054054054, + "grad_norm": 0.496794193983078, + "learning_rate": 4.543164776499945e-06, + "loss": 0.0244, + "step": 152 + }, + { + "epoch": 2.4702702702702704, + "grad_norm": 0.4372956156730652, + "learning_rate": 4.5260078587873416e-06, + "loss": 0.0208, + "step": 153 + }, + { + "epoch": 2.4864864864864864, + "grad_norm": 0.6199434399604797, + "learning_rate": 4.508568340819654e-06, + "loss": 0.028, + "step": 154 + }, + { + "epoch": 2.5027027027027025, + "grad_norm": 0.6074104905128479, + "learning_rate": 4.490848655169986e-06, + "loss": 0.0278, + "step": 155 + }, + { + "epoch": 2.518918918918919, + "grad_norm": 0.5419324636459351, + "learning_rate": 4.472851273490985e-06, + "loss": 0.0181, + "step": 156 + }, + { + "epoch": 2.535135135135135, + "grad_norm": 0.4877943992614746, + "learning_rate": 4.454578706170075e-06, + "loss": 0.0214, + "step": 157 + }, + { + "epoch": 2.5513513513513515, + "grad_norm": 0.5049244165420532, + "learning_rate": 4.436033501979299e-06, + "loss": 0.0214, + "step": 158 + }, + { + "epoch": 2.5675675675675675, + "grad_norm": 0.45928511023521423, + "learning_rate": 4.417218247719794e-06, + "loss": 0.0167, + "step": 159 + }, + { + "epoch": 2.583783783783784, + "grad_norm": 0.5185860395431519, + "learning_rate": 4.398135567860972e-06, + "loss": 0.0243, + "step": 160 + }, + { + "epoch": 2.6, + "grad_norm": 0.3984812796115875, + "learning_rate": 4.378788124174441e-06, + "loss": 0.0201, + 
"step": 161 + }, + { + "epoch": 2.616216216216216, + "grad_norm": 0.607692301273346, + "learning_rate": 4.359178615362725e-06, + "loss": 0.0247, + "step": 162 + }, + { + "epoch": 2.6324324324324326, + "grad_norm": 0.5436367988586426, + "learning_rate": 4.33930977668283e-06, + "loss": 0.0204, + "step": 163 + }, + { + "epoch": 2.6486486486486487, + "grad_norm": 0.6367728114128113, + "learning_rate": 4.319184379564716e-06, + "loss": 0.0222, + "step": 164 + }, + { + "epoch": 2.6648648648648647, + "grad_norm": 0.5538708567619324, + "learning_rate": 4.298805231224721e-06, + "loss": 0.0215, + "step": 165 + }, + { + "epoch": 2.6810810810810812, + "grad_norm": 0.5421778559684753, + "learning_rate": 4.278175174273989e-06, + "loss": 0.0201, + "step": 166 + }, + { + "epoch": 2.6972972972972973, + "grad_norm": 0.6123104691505432, + "learning_rate": 4.257297086321967e-06, + "loss": 0.0209, + "step": 167 + }, + { + "epoch": 2.7135135135135133, + "grad_norm": 0.6386194229125977, + "learning_rate": 4.236173879575022e-06, + "loss": 0.0214, + "step": 168 + }, + { + "epoch": 2.72972972972973, + "grad_norm": 0.6019391417503357, + "learning_rate": 4.2148085004302205e-06, + "loss": 0.0246, + "step": 169 + }, + { + "epoch": 2.745945945945946, + "grad_norm": 0.5638225674629211, + "learning_rate": 4.1932039290643534e-06, + "loss": 0.0189, + "step": 170 + }, + { + "epoch": 2.762162162162162, + "grad_norm": 0.6640142202377319, + "learning_rate": 4.1713631790182366e-06, + "loss": 0.0236, + "step": 171 + }, + { + "epoch": 2.7783783783783784, + "grad_norm": 0.5170625448226929, + "learning_rate": 4.149289296776369e-06, + "loss": 0.0203, + "step": 172 + }, + { + "epoch": 2.7945945945945945, + "grad_norm": 0.5130777955055237, + "learning_rate": 4.126985361341984e-06, + "loss": 0.0195, + "step": 173 + }, + { + "epoch": 2.810810810810811, + "grad_norm": 0.5125660300254822, + "learning_rate": 4.104454483807579e-06, + "loss": 0.0229, + "step": 174 + }, + { + "epoch": 2.827027027027027, + "grad_norm": 
0.573662281036377, + "learning_rate": 4.0816998069209516e-06, + "loss": 0.0202, + "step": 175 + }, + { + "epoch": 2.8432432432432435, + "grad_norm": 0.6013869643211365, + "learning_rate": 4.058724504646834e-06, + "loss": 0.0319, + "step": 176 + }, + { + "epoch": 2.8594594594594596, + "grad_norm": 0.5050269365310669, + "learning_rate": 4.0355317817241705e-06, + "loss": 0.0189, + "step": 177 + }, + { + "epoch": 2.8756756756756756, + "grad_norm": 0.5249143838882446, + "learning_rate": 4.012124873219094e-06, + "loss": 0.0214, + "step": 178 + }, + { + "epoch": 2.891891891891892, + "grad_norm": 0.5124053955078125, + "learning_rate": 3.988507044073687e-06, + "loss": 0.0162, + "step": 179 + }, + { + "epoch": 2.908108108108108, + "grad_norm": 0.4640377461910248, + "learning_rate": 3.964681588650562e-06, + "loss": 0.0189, + "step": 180 + }, + { + "epoch": 2.924324324324324, + "grad_norm": 0.6197820901870728, + "learning_rate": 3.940651830273342e-06, + "loss": 0.0237, + "step": 181 + }, + { + "epoch": 2.9405405405405407, + "grad_norm": 0.6041496992111206, + "learning_rate": 3.916421120763106e-06, + "loss": 0.0241, + "step": 182 + }, + { + "epoch": 2.9567567567567568, + "grad_norm": 0.5259250402450562, + "learning_rate": 3.891992839970855e-06, + "loss": 0.0207, + "step": 183 + } + ], + "logging_steps": 1, + "max_steps": 366, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 61, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.5973867881037824e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-183/training_args.bin b/checkpoint-183/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..5b28d107f55169977eced33ac6929abb398bb2c5 --- /dev/null +++ b/checkpoint-183/training_args.bin @@ 
-0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e2f1aaf0f48ae52048eea3703205522237e597bd418f53d57d152ef3ad9cbbc +size 8056 diff --git a/checkpoint-183/zero_to_fp32.py b/checkpoint-183/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/checkpoint-183/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in 
state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to 
the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel 
{unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + 
print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + 
partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the 
partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + 
state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. 
you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. 
Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/checkpoint-244/README.md b/checkpoint-244/README.md new file mode 100644 index 0000000000000000000000000000000000000000..be5c87703f12b547886cc6a2ecfbe9ee150496fa --- /dev/null +++ b/checkpoint-244/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Llama-3.1-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/checkpoint-244/adapter_config.json b/checkpoint-244/adapter_config.json new file mode 100644 index 
0000000000000000000000000000000000000000..aaa71b6240dcb4147fb982eb2f0ff89574c4fb31 --- /dev/null +++ b/checkpoint-244/adapter_config.json @@ -0,0 +1,40 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.1-8B-Instruct", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 512, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [ + "embed_tokens", + "lm_head" + ], + "peft_type": "LORA", + "r": 256, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "k_proj", + "up_proj", + "gate_proj", + "v_proj", + "down_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-244/adapter_model.safetensors b/checkpoint-244/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c47beca20cadc51efcaf640819009cdbc4f6484a --- /dev/null +++ b/checkpoint-244/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db5ae2d10ebc549e5d7e162fa7eb437dab66b11755c9dc4975eac0ee8a86ac4e +size 3443586272 diff --git a/checkpoint-244/global_step243/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-244/global_step243/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0f42810b83687e63dadf4de18139210ee68f2d34 --- /dev/null +++ b/checkpoint-244/global_step243/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a01483cc0f00bcb7a761b972b7ee30c40dbbc2a716cae2af77bb7b07ca2087ed +size 20661195036 diff --git 
a/checkpoint-244/global_step243/mp_rank_00_model_states.pt b/checkpoint-244/global_step243/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d37e34c3a560ea365c095cd1eea7caa9a7d09df1 --- /dev/null +++ b/checkpoint-244/global_step243/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c565cae78f597ed8851712f1ec400061fef9f4e1f747a373ae8afe511f105069 +size 3555326649 diff --git a/checkpoint-244/latest b/checkpoint-244/latest new file mode 100644 index 0000000000000000000000000000000000000000..2060374fe1e21c009bc46302652418d2d95ab705 --- /dev/null +++ b/checkpoint-244/latest @@ -0,0 +1 @@ +global_step243 \ No newline at end of file diff --git a/checkpoint-244/rng_state.pth b/checkpoint-244/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..7d7db6392b22d690640cccc8dd0a60bc05615926 --- /dev/null +++ b/checkpoint-244/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3826efaf0684e2f6b9c4396e4b223310c6160e5eb6fe1184ae09e364c6e176d +size 14244 diff --git a/checkpoint-244/scheduler.pt b/checkpoint-244/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..af4d3eb478a3dfac8e2b949539dee37e17569d82 --- /dev/null +++ b/checkpoint-244/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0846800f5bb7abb340a8341a16a086f1a97a0c762bfaed049cccff6bf7bea00d +size 1064 diff --git a/checkpoint-244/special_tokens_map.json b/checkpoint-244/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..278b7f0f84be865c4687700ee7b3c63d89a51e18 --- /dev/null +++ b/checkpoint-244/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-244/tokenizer.json b/checkpoint-244/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-244/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-244/tokenizer_config.json b/checkpoint-244/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ca91a2ef55f4239a7af81d7c9abb05f53621a07b --- /dev/null +++ b/checkpoint-244/tokenizer_config.json @@ -0,0 +1,2064 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": 
"<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": 
"<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": 
false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": 
false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": 
"<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": 
"<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + 
"extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|end_of_text|>", + "tokenizer_class": "PreTrainedTokenizer" +} diff --git a/checkpoint-244/trainer_state.json b/checkpoint-244/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2b875c24ba94c0d79748870b975bcd69606b2e35 --- /dev/null +++ b/checkpoint-244/trainer_state.json @@ -0,0 +1,1741 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.9405405405405407, + "eval_steps": 500, + "global_step": 244, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.016216216216216217, + "grad_norm": 39.12052917480469, + "learning_rate": 5.0000000000000004e-08, + "loss": 2.2957, + "step": 1 + }, + { + "epoch": 0.032432432432432434, + "grad_norm": 38.9581413269043, + "learning_rate": 1.0000000000000001e-07, + "loss": 2.2959, + "step": 2 + }, + { + "epoch": 0.04864864864864865, + "grad_norm": 39.2702751159668, + "learning_rate": 1.5000000000000002e-07, + "loss": 2.2677, + "step": 3 + }, + { + "epoch": 0.06486486486486487, + "grad_norm": 39.18815231323242, + "learning_rate": 2.0000000000000002e-07, + "loss": 2.2936, + "step": 4 + }, + { + "epoch": 0.08108108108108109, + "grad_norm": 38.66701889038086, + "learning_rate": 2.5000000000000004e-07, + "loss": 2.2561, + "step": 5 + }, + { + "epoch": 0.0972972972972973, + "grad_norm": 39.53536605834961, + "learning_rate": 3.0000000000000004e-07, + "loss": 2.2579, + "step": 6 + }, + { + "epoch": 0.11351351351351352, + "grad_norm": 39.3793830871582, + "learning_rate": 3.5000000000000004e-07, + "loss": 2.2627, + "step": 7 + }, + { + "epoch": 0.12972972972972974, + "grad_norm": 39.88922119140625, + "learning_rate": 4.0000000000000003e-07, + "loss": 2.2729, + "step": 8 + }, + { + "epoch": 0.14594594594594595, + "grad_norm": 37.9880256652832, + "learning_rate": 
4.5000000000000003e-07, + "loss": 2.2311, + "step": 9 + }, + { + "epoch": 0.16216216216216217, + "grad_norm": 37.024139404296875, + "learning_rate": 5.000000000000001e-07, + "loss": 2.1773, + "step": 10 + }, + { + "epoch": 0.1783783783783784, + "grad_norm": 36.89325714111328, + "learning_rate": 5.5e-07, + "loss": 2.1927, + "step": 11 + }, + { + "epoch": 0.1945945945945946, + "grad_norm": 37.244178771972656, + "learning_rate": 6.000000000000001e-07, + "loss": 2.1757, + "step": 12 + }, + { + "epoch": 0.21081081081081082, + "grad_norm": 34.77650451660156, + "learning_rate": 6.5e-07, + "loss": 2.0392, + "step": 13 + }, + { + "epoch": 0.22702702702702704, + "grad_norm": 34.78818893432617, + "learning_rate": 7.000000000000001e-07, + "loss": 1.9996, + "step": 14 + }, + { + "epoch": 0.24324324324324326, + "grad_norm": 34.86852264404297, + "learning_rate": 7.5e-07, + "loss": 1.9496, + "step": 15 + }, + { + "epoch": 0.2594594594594595, + "grad_norm": 35.202796936035156, + "learning_rate": 8.000000000000001e-07, + "loss": 1.8542, + "step": 16 + }, + { + "epoch": 0.2756756756756757, + "grad_norm": 34.11354064941406, + "learning_rate": 8.500000000000001e-07, + "loss": 1.7118, + "step": 17 + }, + { + "epoch": 0.2918918918918919, + "grad_norm": 36.309059143066406, + "learning_rate": 9.000000000000001e-07, + "loss": 1.6834, + "step": 18 + }, + { + "epoch": 0.3081081081081081, + "grad_norm": 34.69994354248047, + "learning_rate": 9.500000000000001e-07, + "loss": 1.5298, + "step": 19 + }, + { + "epoch": 0.32432432432432434, + "grad_norm": 35.43153381347656, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.4191, + "step": 20 + }, + { + "epoch": 0.34054054054054056, + "grad_norm": 33.53745651245117, + "learning_rate": 1.0500000000000001e-06, + "loss": 1.3068, + "step": 21 + }, + { + "epoch": 0.3567567567567568, + "grad_norm": 33.775604248046875, + "learning_rate": 1.1e-06, + "loss": 1.224, + "step": 22 + }, + { + "epoch": 0.372972972972973, + "grad_norm": 30.57005500793457, + 
"learning_rate": 1.1500000000000002e-06, + "loss": 1.0704, + "step": 23 + }, + { + "epoch": 0.3891891891891892, + "grad_norm": 27.964860916137695, + "learning_rate": 1.2000000000000002e-06, + "loss": 0.9548, + "step": 24 + }, + { + "epoch": 0.40540540540540543, + "grad_norm": 26.023576736450195, + "learning_rate": 1.25e-06, + "loss": 0.8503, + "step": 25 + }, + { + "epoch": 0.42162162162162165, + "grad_norm": 25.0452938079834, + "learning_rate": 1.3e-06, + "loss": 0.6938, + "step": 26 + }, + { + "epoch": 0.43783783783783786, + "grad_norm": 24.663373947143555, + "learning_rate": 1.3500000000000002e-06, + "loss": 0.5648, + "step": 27 + }, + { + "epoch": 0.4540540540540541, + "grad_norm": 21.61736488342285, + "learning_rate": 1.4000000000000001e-06, + "loss": 0.435, + "step": 28 + }, + { + "epoch": 0.4702702702702703, + "grad_norm": 18.3259334564209, + "learning_rate": 1.45e-06, + "loss": 0.3322, + "step": 29 + }, + { + "epoch": 0.4864864864864865, + "grad_norm": 16.80081558227539, + "learning_rate": 1.5e-06, + "loss": 0.2625, + "step": 30 + }, + { + "epoch": 0.5027027027027027, + "grad_norm": 14.789258003234863, + "learning_rate": 1.5500000000000002e-06, + "loss": 0.1757, + "step": 31 + }, + { + "epoch": 0.518918918918919, + "grad_norm": 10.406538963317871, + "learning_rate": 1.6000000000000001e-06, + "loss": 0.1376, + "step": 32 + }, + { + "epoch": 0.5351351351351351, + "grad_norm": 4.868802547454834, + "learning_rate": 1.6500000000000003e-06, + "loss": 0.0815, + "step": 33 + }, + { + "epoch": 0.5513513513513514, + "grad_norm": 1.8639686107635498, + "learning_rate": 1.7000000000000002e-06, + "loss": 0.0628, + "step": 34 + }, + { + "epoch": 0.5675675675675675, + "grad_norm": 1.897918462753296, + "learning_rate": 1.75e-06, + "loss": 0.0775, + "step": 35 + }, + { + "epoch": 0.5837837837837838, + "grad_norm": 1.296712040901184, + "learning_rate": 1.8000000000000001e-06, + "loss": 0.0565, + "step": 36 + }, + { + "epoch": 0.6, + "grad_norm": 1.0163214206695557, + 
"learning_rate": 1.85e-06, + "loss": 0.0544, + "step": 37 + }, + { + "epoch": 0.6162162162162163, + "grad_norm": 1.070162296295166, + "learning_rate": 1.9000000000000002e-06, + "loss": 0.0621, + "step": 38 + }, + { + "epoch": 0.6324324324324324, + "grad_norm": 1.024267315864563, + "learning_rate": 1.9500000000000004e-06, + "loss": 0.0566, + "step": 39 + }, + { + "epoch": 0.6486486486486487, + "grad_norm": 0.9016611576080322, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.0511, + "step": 40 + }, + { + "epoch": 0.6648648648648648, + "grad_norm": 0.8272562623023987, + "learning_rate": 2.05e-06, + "loss": 0.0533, + "step": 41 + }, + { + "epoch": 0.6810810810810811, + "grad_norm": 0.8875278234481812, + "learning_rate": 2.1000000000000002e-06, + "loss": 0.0495, + "step": 42 + }, + { + "epoch": 0.6972972972972973, + "grad_norm": 0.8804877996444702, + "learning_rate": 2.15e-06, + "loss": 0.0506, + "step": 43 + }, + { + "epoch": 0.7135135135135136, + "grad_norm": 0.7133358120918274, + "learning_rate": 2.2e-06, + "loss": 0.0467, + "step": 44 + }, + { + "epoch": 0.7297297297297297, + "grad_norm": 0.8142214417457581, + "learning_rate": 2.25e-06, + "loss": 0.0552, + "step": 45 + }, + { + "epoch": 0.745945945945946, + "grad_norm": 0.8341564536094666, + "learning_rate": 2.3000000000000004e-06, + "loss": 0.0574, + "step": 46 + }, + { + "epoch": 0.7621621621621621, + "grad_norm": 0.6500507593154907, + "learning_rate": 2.35e-06, + "loss": 0.0398, + "step": 47 + }, + { + "epoch": 0.7783783783783784, + "grad_norm": 0.6163598895072937, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.0459, + "step": 48 + }, + { + "epoch": 0.7945945945945946, + "grad_norm": 0.663949191570282, + "learning_rate": 2.4500000000000003e-06, + "loss": 0.046, + "step": 49 + }, + { + "epoch": 0.8108108108108109, + "grad_norm": 0.7521553635597229, + "learning_rate": 2.5e-06, + "loss": 0.0525, + "step": 50 + }, + { + "epoch": 0.827027027027027, + "grad_norm": 0.7828383445739746, + "learning_rate": 
2.55e-06, + "loss": 0.0558, + "step": 51 + }, + { + "epoch": 0.8432432432432433, + "grad_norm": 0.7935078740119934, + "learning_rate": 2.6e-06, + "loss": 0.0451, + "step": 52 + }, + { + "epoch": 0.8594594594594595, + "grad_norm": 0.6327880620956421, + "learning_rate": 2.6500000000000005e-06, + "loss": 0.0403, + "step": 53 + }, + { + "epoch": 0.8756756756756757, + "grad_norm": 0.6185981035232544, + "learning_rate": 2.7000000000000004e-06, + "loss": 0.0406, + "step": 54 + }, + { + "epoch": 0.8918918918918919, + "grad_norm": 0.5417979955673218, + "learning_rate": 2.7500000000000004e-06, + "loss": 0.0426, + "step": 55 + }, + { + "epoch": 0.9081081081081082, + "grad_norm": 0.7140630483627319, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.0446, + "step": 56 + }, + { + "epoch": 0.9243243243243243, + "grad_norm": 0.7191944122314453, + "learning_rate": 2.85e-06, + "loss": 0.047, + "step": 57 + }, + { + "epoch": 0.9405405405405406, + "grad_norm": 0.7562940716743469, + "learning_rate": 2.9e-06, + "loss": 0.0476, + "step": 58 + }, + { + "epoch": 0.9567567567567568, + "grad_norm": 0.7422239184379578, + "learning_rate": 2.95e-06, + "loss": 0.0462, + "step": 59 + }, + { + "epoch": 0.972972972972973, + "grad_norm": 0.677144467830658, + "learning_rate": 3e-06, + "loss": 0.0475, + "step": 60 + }, + { + "epoch": 0.9891891891891892, + "grad_norm": 0.6127192974090576, + "learning_rate": 3.05e-06, + "loss": 0.0434, + "step": 61 + }, + { + "epoch": 1.0, + "grad_norm": 0.6127192974090576, + "learning_rate": 3.1000000000000004e-06, + "loss": 0.0375, + "step": 62 + }, + { + "epoch": 1.0162162162162163, + "grad_norm": 0.959559440612793, + "learning_rate": 3.1500000000000003e-06, + "loss": 0.0421, + "step": 63 + }, + { + "epoch": 1.0324324324324325, + "grad_norm": 0.6539880037307739, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.0414, + "step": 64 + }, + { + "epoch": 1.0486486486486486, + "grad_norm": 0.5929313898086548, + "learning_rate": 3.2500000000000002e-06, + "loss": 
0.0451, + "step": 65 + }, + { + "epoch": 1.0648648648648649, + "grad_norm": 0.6479571461677551, + "learning_rate": 3.3000000000000006e-06, + "loss": 0.0415, + "step": 66 + }, + { + "epoch": 1.0810810810810811, + "grad_norm": 0.5496926307678223, + "learning_rate": 3.3500000000000005e-06, + "loss": 0.0366, + "step": 67 + }, + { + "epoch": 1.0972972972972972, + "grad_norm": 0.5373682379722595, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.0383, + "step": 68 + }, + { + "epoch": 1.1135135135135135, + "grad_norm": 0.5489712357521057, + "learning_rate": 3.45e-06, + "loss": 0.0427, + "step": 69 + }, + { + "epoch": 1.1297297297297297, + "grad_norm": 0.6830047369003296, + "learning_rate": 3.5e-06, + "loss": 0.039, + "step": 70 + }, + { + "epoch": 1.145945945945946, + "grad_norm": 0.5794199705123901, + "learning_rate": 3.5500000000000003e-06, + "loss": 0.0409, + "step": 71 + }, + { + "epoch": 1.1621621621621623, + "grad_norm": 0.571513831615448, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.0392, + "step": 72 + }, + { + "epoch": 1.1783783783783783, + "grad_norm": 0.7753933668136597, + "learning_rate": 3.65e-06, + "loss": 0.0365, + "step": 73 + }, + { + "epoch": 1.1945945945945946, + "grad_norm": 0.6135310530662537, + "learning_rate": 3.7e-06, + "loss": 0.036, + "step": 74 + }, + { + "epoch": 1.2108108108108109, + "grad_norm": 0.5497344136238098, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.035, + "step": 75 + }, + { + "epoch": 1.227027027027027, + "grad_norm": 0.5861782431602478, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.0434, + "step": 76 + }, + { + "epoch": 1.2432432432432432, + "grad_norm": 0.6941010355949402, + "learning_rate": 3.85e-06, + "loss": 0.0336, + "step": 77 + }, + { + "epoch": 1.2594594594594595, + "grad_norm": 0.5305830240249634, + "learning_rate": 3.900000000000001e-06, + "loss": 0.0391, + "step": 78 + }, + { + "epoch": 1.2756756756756757, + "grad_norm": 0.6456385254859924, + "learning_rate": 3.95e-06, + "loss": 
0.0422, + "step": 79 + }, + { + "epoch": 1.291891891891892, + "grad_norm": 0.5704363584518433, + "learning_rate": 4.000000000000001e-06, + "loss": 0.0342, + "step": 80 + }, + { + "epoch": 1.308108108108108, + "grad_norm": 0.5257390141487122, + "learning_rate": 4.05e-06, + "loss": 0.0369, + "step": 81 + }, + { + "epoch": 1.3243243243243243, + "grad_norm": 0.5541989207267761, + "learning_rate": 4.1e-06, + "loss": 0.0331, + "step": 82 + }, + { + "epoch": 1.3405405405405406, + "grad_norm": 0.7190688252449036, + "learning_rate": 4.15e-06, + "loss": 0.039, + "step": 83 + }, + { + "epoch": 1.3567567567567567, + "grad_norm": 0.4766721725463867, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.0354, + "step": 84 + }, + { + "epoch": 1.372972972972973, + "grad_norm": 0.5847981572151184, + "learning_rate": 4.25e-06, + "loss": 0.0355, + "step": 85 + }, + { + "epoch": 1.3891891891891892, + "grad_norm": 0.6361181139945984, + "learning_rate": 4.3e-06, + "loss": 0.0415, + "step": 86 + }, + { + "epoch": 1.4054054054054055, + "grad_norm": 0.6437036395072937, + "learning_rate": 4.350000000000001e-06, + "loss": 0.0353, + "step": 87 + }, + { + "epoch": 1.4216216216216218, + "grad_norm": 0.712043046951294, + "learning_rate": 4.4e-06, + "loss": 0.0311, + "step": 88 + }, + { + "epoch": 1.4378378378378378, + "grad_norm": 0.5829771757125854, + "learning_rate": 4.450000000000001e-06, + "loss": 0.0433, + "step": 89 + }, + { + "epoch": 1.454054054054054, + "grad_norm": 0.6977937817573547, + "learning_rate": 4.5e-06, + "loss": 0.0391, + "step": 90 + }, + { + "epoch": 1.4702702702702704, + "grad_norm": 0.49931228160858154, + "learning_rate": 4.5500000000000005e-06, + "loss": 0.0352, + "step": 91 + }, + { + "epoch": 1.4864864864864864, + "grad_norm": 0.5281490683555603, + "learning_rate": 4.600000000000001e-06, + "loss": 0.0385, + "step": 92 + }, + { + "epoch": 1.5027027027027027, + "grad_norm": 0.613349974155426, + "learning_rate": 4.65e-06, + "loss": 0.0399, + "step": 93 + }, + { + 
"epoch": 1.518918918918919, + "grad_norm": 0.6584879755973816, + "learning_rate": 4.7e-06, + "loss": 0.043, + "step": 94 + }, + { + "epoch": 1.535135135135135, + "grad_norm": 0.6006895303726196, + "learning_rate": 4.75e-06, + "loss": 0.0372, + "step": 95 + }, + { + "epoch": 1.5513513513513515, + "grad_norm": 0.5364943146705627, + "learning_rate": 4.800000000000001e-06, + "loss": 0.0384, + "step": 96 + }, + { + "epoch": 1.5675675675675675, + "grad_norm": 0.4963968098163605, + "learning_rate": 4.85e-06, + "loss": 0.0324, + "step": 97 + }, + { + "epoch": 1.5837837837837838, + "grad_norm": 0.5868538618087769, + "learning_rate": 4.9000000000000005e-06, + "loss": 0.0386, + "step": 98 + }, + { + "epoch": 1.6, + "grad_norm": 0.6690974235534668, + "learning_rate": 4.95e-06, + "loss": 0.0332, + "step": 99 + }, + { + "epoch": 1.6162162162162161, + "grad_norm": 0.6118388175964355, + "learning_rate": 5e-06, + "loss": 0.0398, + "step": 100 + }, + { + "epoch": 1.6324324324324324, + "grad_norm": 0.6872304677963257, + "learning_rate": 4.999825642177387e-06, + "loss": 0.0333, + "step": 101 + }, + { + "epoch": 1.6486486486486487, + "grad_norm": 0.6457200646400452, + "learning_rate": 4.999302593030069e-06, + "loss": 0.0381, + "step": 102 + }, + { + "epoch": 1.6648648648648647, + "grad_norm": 0.6096416115760803, + "learning_rate": 4.998430925516213e-06, + "loss": 0.0385, + "step": 103 + }, + { + "epoch": 1.6810810810810812, + "grad_norm": 0.582796573638916, + "learning_rate": 4.99721076122146e-06, + "loss": 0.0317, + "step": 104 + }, + { + "epoch": 1.6972972972972973, + "grad_norm": 0.5576394200325012, + "learning_rate": 4.995642270341961e-06, + "loss": 0.0378, + "step": 105 + }, + { + "epoch": 1.7135135135135136, + "grad_norm": 0.7414760589599609, + "learning_rate": 4.99372567166064e-06, + "loss": 0.0403, + "step": 106 + }, + { + "epoch": 1.7297297297297298, + "grad_norm": 0.6029103994369507, + "learning_rate": 4.991461232516675e-06, + "loss": 0.0418, + "step": 107 + }, + { + "epoch": 
1.7459459459459459, + "grad_norm": 0.771609365940094, + "learning_rate": 4.98884926876821e-06, + "loss": 0.0413, + "step": 108 + }, + { + "epoch": 1.7621621621621621, + "grad_norm": 0.6869891285896301, + "learning_rate": 4.9858901447482924e-06, + "loss": 0.0367, + "step": 109 + }, + { + "epoch": 1.7783783783783784, + "grad_norm": 0.4931647479534149, + "learning_rate": 4.982584273214061e-06, + "loss": 0.033, + "step": 110 + }, + { + "epoch": 1.7945945945945945, + "grad_norm": 0.5160052180290222, + "learning_rate": 4.978932115289165e-06, + "loss": 0.0357, + "step": 111 + }, + { + "epoch": 1.810810810810811, + "grad_norm": 0.49750861525535583, + "learning_rate": 4.974934180399447e-06, + "loss": 0.0333, + "step": 112 + }, + { + "epoch": 1.827027027027027, + "grad_norm": 0.6596441864967346, + "learning_rate": 4.970591026201884e-06, + "loss": 0.0354, + "step": 113 + }, + { + "epoch": 1.8432432432432433, + "grad_norm": 0.6613579988479614, + "learning_rate": 4.965903258506806e-06, + "loss": 0.0377, + "step": 114 + }, + { + "epoch": 1.8594594594594596, + "grad_norm": 0.5383866429328918, + "learning_rate": 4.9608715311933865e-06, + "loss": 0.0418, + "step": 115 + }, + { + "epoch": 1.8756756756756756, + "grad_norm": 0.6303413510322571, + "learning_rate": 4.955496546118439e-06, + "loss": 0.0351, + "step": 116 + }, + { + "epoch": 1.8918918918918919, + "grad_norm": 0.5293605923652649, + "learning_rate": 4.949779053018519e-06, + "loss": 0.0322, + "step": 117 + }, + { + "epoch": 1.9081081081081082, + "grad_norm": 0.5211143493652344, + "learning_rate": 4.943719849405347e-06, + "loss": 0.0374, + "step": 118 + }, + { + "epoch": 1.9243243243243242, + "grad_norm": 0.5933778882026672, + "learning_rate": 4.937319780454559e-06, + "loss": 0.0377, + "step": 119 + }, + { + "epoch": 1.9405405405405407, + "grad_norm": 0.6020687818527222, + "learning_rate": 4.930579738887827e-06, + "loss": 0.0313, + "step": 120 + }, + { + "epoch": 1.9567567567567568, + "grad_norm": 0.7828154563903809, + 
"learning_rate": 4.923500664848327e-06, + "loss": 0.0372, + "step": 121 + }, + { + "epoch": 1.972972972972973, + "grad_norm": 0.6172424554824829, + "learning_rate": 4.9160835457696075e-06, + "loss": 0.0387, + "step": 122 + }, + { + "epoch": 1.9891891891891893, + "grad_norm": 0.5671921372413635, + "learning_rate": 4.9083294162378545e-06, + "loss": 0.0346, + "step": 123 + }, + { + "epoch": 2.0, + "grad_norm": 1.0704405307769775, + "learning_rate": 4.900239357847582e-06, + "loss": 0.0298, + "step": 124 + }, + { + "epoch": 2.016216216216216, + "grad_norm": 0.5932011604309082, + "learning_rate": 4.891814499050762e-06, + "loss": 0.0243, + "step": 125 + }, + { + "epoch": 2.0324324324324325, + "grad_norm": 0.47397834062576294, + "learning_rate": 4.883056014999423e-06, + "loss": 0.0281, + "step": 126 + }, + { + "epoch": 2.0486486486486486, + "grad_norm": 0.538270115852356, + "learning_rate": 4.873965127381734e-06, + "loss": 0.0268, + "step": 127 + }, + { + "epoch": 2.064864864864865, + "grad_norm": 0.3924686908721924, + "learning_rate": 4.864543104251587e-06, + "loss": 0.02, + "step": 128 + }, + { + "epoch": 2.081081081081081, + "grad_norm": 0.5162842869758606, + "learning_rate": 4.854791259851735e-06, + "loss": 0.0237, + "step": 129 + }, + { + "epoch": 2.097297297297297, + "grad_norm": 0.4691126048564911, + "learning_rate": 4.844710954430464e-06, + "loss": 0.0224, + "step": 130 + }, + { + "epoch": 2.1135135135135137, + "grad_norm": 0.47650063037872314, + "learning_rate": 4.834303594051854e-06, + "loss": 0.0202, + "step": 131 + }, + { + "epoch": 2.1297297297297297, + "grad_norm": 0.5041627883911133, + "learning_rate": 4.823570630399665e-06, + "loss": 0.0228, + "step": 132 + }, + { + "epoch": 2.145945945945946, + "grad_norm": 0.5368483662605286, + "learning_rate": 4.812513560574832e-06, + "loss": 0.0241, + "step": 133 + }, + { + "epoch": 2.1621621621621623, + "grad_norm": 0.7245975732803345, + "learning_rate": 4.8011339268866505e-06, + "loss": 0.0294, + "step": 134 + }, + { 
+ "epoch": 2.1783783783783783, + "grad_norm": 0.5506283044815063, + "learning_rate": 4.789433316637644e-06, + "loss": 0.0204, + "step": 135 + }, + { + "epoch": 2.1945945945945944, + "grad_norm": 0.5240617990493774, + "learning_rate": 4.777413361902152e-06, + "loss": 0.0227, + "step": 136 + }, + { + "epoch": 2.210810810810811, + "grad_norm": 0.6164438128471375, + "learning_rate": 4.765075739298683e-06, + "loss": 0.0209, + "step": 137 + }, + { + "epoch": 2.227027027027027, + "grad_norm": 0.551898181438446, + "learning_rate": 4.752422169756048e-06, + "loss": 0.0187, + "step": 138 + }, + { + "epoch": 2.2432432432432434, + "grad_norm": 0.45092299580574036, + "learning_rate": 4.739454418273314e-06, + "loss": 0.0281, + "step": 139 + }, + { + "epoch": 2.2594594594594595, + "grad_norm": 0.48173126578330994, + "learning_rate": 4.726174293673612e-06, + "loss": 0.0213, + "step": 140 + }, + { + "epoch": 2.2756756756756755, + "grad_norm": 0.48536229133605957, + "learning_rate": 4.712583648351827e-06, + "loss": 0.0204, + "step": 141 + }, + { + "epoch": 2.291891891891892, + "grad_norm": 0.4885499179363251, + "learning_rate": 4.698684378016223e-06, + "loss": 0.0225, + "step": 142 + }, + { + "epoch": 2.308108108108108, + "grad_norm": 0.40719687938690186, + "learning_rate": 4.684478421424007e-06, + "loss": 0.0205, + "step": 143 + }, + { + "epoch": 2.3243243243243246, + "grad_norm": 0.4365272521972656, + "learning_rate": 4.669967760110908e-06, + "loss": 0.0224, + "step": 144 + }, + { + "epoch": 2.3405405405405406, + "grad_norm": 0.4639301300048828, + "learning_rate": 4.655154418114774e-06, + "loss": 0.0256, + "step": 145 + }, + { + "epoch": 2.3567567567567567, + "grad_norm": 0.47420835494995117, + "learning_rate": 4.6400404616932505e-06, + "loss": 0.0208, + "step": 146 + }, + { + "epoch": 2.372972972972973, + "grad_norm": 0.5030474066734314, + "learning_rate": 4.624627999035564e-06, + "loss": 0.0255, + "step": 147 + }, + { + "epoch": 2.389189189189189, + "grad_norm": 
0.47888803482055664, + "learning_rate": 4.608919179968457e-06, + "loss": 0.0241, + "step": 148 + }, + { + "epoch": 2.4054054054054053, + "grad_norm": 0.602581262588501, + "learning_rate": 4.592916195656322e-06, + "loss": 0.0243, + "step": 149 + }, + { + "epoch": 2.4216216216216218, + "grad_norm": 0.6816417574882507, + "learning_rate": 4.576621278295558e-06, + "loss": 0.0259, + "step": 150 + }, + { + "epoch": 2.437837837837838, + "grad_norm": 0.6839447617530823, + "learning_rate": 4.5600367008032135e-06, + "loss": 0.0247, + "step": 151 + }, + { + "epoch": 2.454054054054054, + "grad_norm": 0.496794193983078, + "learning_rate": 4.543164776499945e-06, + "loss": 0.0244, + "step": 152 + }, + { + "epoch": 2.4702702702702704, + "grad_norm": 0.4372956156730652, + "learning_rate": 4.5260078587873416e-06, + "loss": 0.0208, + "step": 153 + }, + { + "epoch": 2.4864864864864864, + "grad_norm": 0.6199434399604797, + "learning_rate": 4.508568340819654e-06, + "loss": 0.028, + "step": 154 + }, + { + "epoch": 2.5027027027027025, + "grad_norm": 0.6074104905128479, + "learning_rate": 4.490848655169986e-06, + "loss": 0.0278, + "step": 155 + }, + { + "epoch": 2.518918918918919, + "grad_norm": 0.5419324636459351, + "learning_rate": 4.472851273490985e-06, + "loss": 0.0181, + "step": 156 + }, + { + "epoch": 2.535135135135135, + "grad_norm": 0.4877943992614746, + "learning_rate": 4.454578706170075e-06, + "loss": 0.0214, + "step": 157 + }, + { + "epoch": 2.5513513513513515, + "grad_norm": 0.5049244165420532, + "learning_rate": 4.436033501979299e-06, + "loss": 0.0214, + "step": 158 + }, + { + "epoch": 2.5675675675675675, + "grad_norm": 0.45928511023521423, + "learning_rate": 4.417218247719794e-06, + "loss": 0.0167, + "step": 159 + }, + { + "epoch": 2.583783783783784, + "grad_norm": 0.5185860395431519, + "learning_rate": 4.398135567860972e-06, + "loss": 0.0243, + "step": 160 + }, + { + "epoch": 2.6, + "grad_norm": 0.3984812796115875, + "learning_rate": 4.378788124174441e-06, + "loss": 0.0201, + 
"step": 161 + }, + { + "epoch": 2.616216216216216, + "grad_norm": 0.607692301273346, + "learning_rate": 4.359178615362725e-06, + "loss": 0.0247, + "step": 162 + }, + { + "epoch": 2.6324324324324326, + "grad_norm": 0.5436367988586426, + "learning_rate": 4.33930977668283e-06, + "loss": 0.0204, + "step": 163 + }, + { + "epoch": 2.6486486486486487, + "grad_norm": 0.6367728114128113, + "learning_rate": 4.319184379564716e-06, + "loss": 0.0222, + "step": 164 + }, + { + "epoch": 2.6648648648648647, + "grad_norm": 0.5538708567619324, + "learning_rate": 4.298805231224721e-06, + "loss": 0.0215, + "step": 165 + }, + { + "epoch": 2.6810810810810812, + "grad_norm": 0.5421778559684753, + "learning_rate": 4.278175174273989e-06, + "loss": 0.0201, + "step": 166 + }, + { + "epoch": 2.6972972972972973, + "grad_norm": 0.6123104691505432, + "learning_rate": 4.257297086321967e-06, + "loss": 0.0209, + "step": 167 + }, + { + "epoch": 2.7135135135135133, + "grad_norm": 0.6386194229125977, + "learning_rate": 4.236173879575022e-06, + "loss": 0.0214, + "step": 168 + }, + { + "epoch": 2.72972972972973, + "grad_norm": 0.6019391417503357, + "learning_rate": 4.2148085004302205e-06, + "loss": 0.0246, + "step": 169 + }, + { + "epoch": 2.745945945945946, + "grad_norm": 0.5638225674629211, + "learning_rate": 4.1932039290643534e-06, + "loss": 0.0189, + "step": 170 + }, + { + "epoch": 2.762162162162162, + "grad_norm": 0.6640142202377319, + "learning_rate": 4.1713631790182366e-06, + "loss": 0.0236, + "step": 171 + }, + { + "epoch": 2.7783783783783784, + "grad_norm": 0.5170625448226929, + "learning_rate": 4.149289296776369e-06, + "loss": 0.0203, + "step": 172 + }, + { + "epoch": 2.7945945945945945, + "grad_norm": 0.5130777955055237, + "learning_rate": 4.126985361341984e-06, + "loss": 0.0195, + "step": 173 + }, + { + "epoch": 2.810810810810811, + "grad_norm": 0.5125660300254822, + "learning_rate": 4.104454483807579e-06, + "loss": 0.0229, + "step": 174 + }, + { + "epoch": 2.827027027027027, + "grad_norm": 
0.573662281036377, + "learning_rate": 4.0816998069209516e-06, + "loss": 0.0202, + "step": 175 + }, + { + "epoch": 2.8432432432432435, + "grad_norm": 0.6013869643211365, + "learning_rate": 4.058724504646834e-06, + "loss": 0.0319, + "step": 176 + }, + { + "epoch": 2.8594594594594596, + "grad_norm": 0.5050269365310669, + "learning_rate": 4.0355317817241705e-06, + "loss": 0.0189, + "step": 177 + }, + { + "epoch": 2.8756756756756756, + "grad_norm": 0.5249143838882446, + "learning_rate": 4.012124873219094e-06, + "loss": 0.0214, + "step": 178 + }, + { + "epoch": 2.891891891891892, + "grad_norm": 0.5124053955078125, + "learning_rate": 3.988507044073687e-06, + "loss": 0.0162, + "step": 179 + }, + { + "epoch": 2.908108108108108, + "grad_norm": 0.4640377461910248, + "learning_rate": 3.964681588650562e-06, + "loss": 0.0189, + "step": 180 + }, + { + "epoch": 2.924324324324324, + "grad_norm": 0.6197820901870728, + "learning_rate": 3.940651830273342e-06, + "loss": 0.0237, + "step": 181 + }, + { + "epoch": 2.9405405405405407, + "grad_norm": 0.6041496992111206, + "learning_rate": 3.916421120763106e-06, + "loss": 0.0241, + "step": 182 + }, + { + "epoch": 2.9567567567567568, + "grad_norm": 0.5259250402450562, + "learning_rate": 3.891992839970855e-06, + "loss": 0.0207, + "step": 183 + }, + { + "epoch": 2.972972972972973, + "grad_norm": 0.6110473871231079, + "learning_rate": 3.8673703953060685e-06, + "loss": 0.0199, + "step": 184 + }, + { + "epoch": 2.9891891891891893, + "grad_norm": 0.504909098148346, + "learning_rate": 3.8425572212614155e-06, + "loss": 0.0211, + "step": 185 + }, + { + "epoch": 3.0, + "grad_norm": 1.010295033454895, + "learning_rate": 3.817556778933697e-06, + "loss": 0.0195, + "step": 186 + }, + { + "epoch": 3.016216216216216, + "grad_norm": 0.3988107442855835, + "learning_rate": 3.792372555541064e-06, + "loss": 0.0131, + "step": 187 + }, + { + "epoch": 3.0324324324324325, + "grad_norm": 0.31533047556877136, + "learning_rate": 3.7670080639366e-06, + "loss": 0.0107, + 
"step": 188 + }, + { + "epoch": 3.0486486486486486, + "grad_norm": 0.4819331765174866, + "learning_rate": 3.741466842118327e-06, + "loss": 0.0135, + "step": 189 + }, + { + "epoch": 3.064864864864865, + "grad_norm": 0.33277931809425354, + "learning_rate": 3.7157524527357036e-06, + "loss": 0.0112, + "step": 190 + }, + { + "epoch": 3.081081081081081, + "grad_norm": 0.3936960697174072, + "learning_rate": 3.6898684825926845e-06, + "loss": 0.0125, + "step": 191 + }, + { + "epoch": 3.097297297297297, + "grad_norm": 0.46424582600593567, + "learning_rate": 3.663818542147409e-06, + "loss": 0.0171, + "step": 192 + }, + { + "epoch": 3.1135135135135137, + "grad_norm": 0.4771481156349182, + "learning_rate": 3.6376062650085918e-06, + "loss": 0.0121, + "step": 193 + }, + { + "epoch": 3.1297297297297297, + "grad_norm": 0.4661300778388977, + "learning_rate": 3.61123530742869e-06, + "loss": 0.0164, + "step": 194 + }, + { + "epoch": 3.145945945945946, + "grad_norm": 0.424891859292984, + "learning_rate": 3.5847093477938955e-06, + "loss": 0.0116, + "step": 195 + }, + { + "epoch": 3.1621621621621623, + "grad_norm": 0.4026256501674652, + "learning_rate": 3.5580320861110627e-06, + "loss": 0.0135, + "step": 196 + }, + { + "epoch": 3.1783783783783783, + "grad_norm": 0.4946088492870331, + "learning_rate": 3.5312072434915983e-06, + "loss": 0.0104, + "step": 197 + }, + { + "epoch": 3.1945945945945944, + "grad_norm": 0.3920349180698395, + "learning_rate": 3.5042385616324243e-06, + "loss": 0.0127, + "step": 198 + }, + { + "epoch": 3.210810810810811, + "grad_norm": 0.35541996359825134, + "learning_rate": 3.477129802294057e-06, + "loss": 0.0095, + "step": 199 + }, + { + "epoch": 3.227027027027027, + "grad_norm": 0.4382397532463074, + "learning_rate": 3.4498847467759e-06, + "loss": 0.009, + "step": 200 + }, + { + "epoch": 3.2432432432432434, + "grad_norm": 0.4411066174507141, + "learning_rate": 3.4225071953887977e-06, + "loss": 0.0103, + "step": 201 + }, + { + "epoch": 3.2594594594594595, + 
"grad_norm": 0.4458081126213074, + "learning_rate": 3.3950009669249502e-06, + "loss": 0.011, + "step": 202 + }, + { + "epoch": 3.2756756756756755, + "grad_norm": 0.6307731866836548, + "learning_rate": 3.3673698981252385e-06, + "loss": 0.0123, + "step": 203 + }, + { + "epoch": 3.291891891891892, + "grad_norm": 0.48209428787231445, + "learning_rate": 3.3396178431440572e-06, + "loss": 0.0097, + "step": 204 + }, + { + "epoch": 3.308108108108108, + "grad_norm": 0.5341079831123352, + "learning_rate": 3.3117486730117092e-06, + "loss": 0.0098, + "step": 205 + }, + { + "epoch": 3.3243243243243246, + "grad_norm": 0.6222304701805115, + "learning_rate": 3.283766275094454e-06, + "loss": 0.0153, + "step": 206 + }, + { + "epoch": 3.3405405405405406, + "grad_norm": 0.6000131964683533, + "learning_rate": 3.255674552552267e-06, + "loss": 0.0124, + "step": 207 + }, + { + "epoch": 3.3567567567567567, + "grad_norm": 0.573101818561554, + "learning_rate": 3.227477423794412e-06, + "loss": 0.0111, + "step": 208 + }, + { + "epoch": 3.372972972972973, + "grad_norm": 0.49702250957489014, + "learning_rate": 3.1991788219328657e-06, + "loss": 0.0125, + "step": 209 + }, + { + "epoch": 3.389189189189189, + "grad_norm": 0.4762848913669586, + "learning_rate": 3.1707826942337124e-06, + "loss": 0.0089, + "step": 210 + }, + { + "epoch": 3.4054054054054053, + "grad_norm": 0.671346127986908, + "learning_rate": 3.142293001566548e-06, + "loss": 0.0128, + "step": 211 + }, + { + "epoch": 3.4216216216216218, + "grad_norm": 0.5226602554321289, + "learning_rate": 3.1137137178519983e-06, + "loss": 0.0119, + "step": 212 + }, + { + "epoch": 3.437837837837838, + "grad_norm": 0.40818020701408386, + "learning_rate": 3.085048829507406e-06, + "loss": 0.0097, + "step": 213 + }, + { + "epoch": 3.454054054054054, + "grad_norm": 0.5493360161781311, + "learning_rate": 3.056302334890786e-06, + "loss": 0.0117, + "step": 214 + }, + { + "epoch": 3.4702702702702704, + "grad_norm": 0.4277520477771759, + "learning_rate": 
3.027478243743106e-06, + "loss": 0.0115, + "step": 215 + }, + { + "epoch": 3.4864864864864864, + "grad_norm": 0.5382378697395325, + "learning_rate": 2.9985805766289815e-06, + "loss": 0.0141, + "step": 216 + }, + { + "epoch": 3.5027027027027025, + "grad_norm": 0.686087429523468, + "learning_rate": 2.9696133643758663e-06, + "loss": 0.0106, + "step": 217 + }, + { + "epoch": 3.518918918918919, + "grad_norm": 0.4279845356941223, + "learning_rate": 2.940580647511805e-06, + "loss": 0.0101, + "step": 218 + }, + { + "epoch": 3.535135135135135, + "grad_norm": 0.3966323435306549, + "learning_rate": 2.911486475701835e-06, + "loss": 0.0111, + "step": 219 + }, + { + "epoch": 3.5513513513513515, + "grad_norm": 0.4444655776023865, + "learning_rate": 2.8823349071831154e-06, + "loss": 0.0122, + "step": 220 + }, + { + "epoch": 3.5675675675675675, + "grad_norm": 0.4062931537628174, + "learning_rate": 2.853130008198855e-06, + "loss": 0.0088, + "step": 221 + }, + { + "epoch": 3.583783783783784, + "grad_norm": 0.4915783107280731, + "learning_rate": 2.8238758524311316e-06, + "loss": 0.0106, + "step": 222 + }, + { + "epoch": 3.6, + "grad_norm": 0.5415279865264893, + "learning_rate": 2.7945765204326664e-06, + "loss": 0.0102, + "step": 223 + }, + { + "epoch": 3.616216216216216, + "grad_norm": 0.5782752633094788, + "learning_rate": 2.7652360990576457e-06, + "loss": 0.0113, + "step": 224 + }, + { + "epoch": 3.6324324324324326, + "grad_norm": 0.5013785362243652, + "learning_rate": 2.735858680891656e-06, + "loss": 0.0092, + "step": 225 + }, + { + "epoch": 3.6486486486486487, + "grad_norm": 0.553318440914154, + "learning_rate": 2.7064483636808314e-06, + "loss": 0.0119, + "step": 226 + }, + { + "epoch": 3.6648648648648647, + "grad_norm": 0.47181862592697144, + "learning_rate": 2.677009249760268e-06, + "loss": 0.0152, + "step": 227 + }, + { + "epoch": 3.6810810810810812, + "grad_norm": 0.5877431631088257, + "learning_rate": 2.6475454454818072e-06, + "loss": 0.0181, + "step": 228 + }, + { + "epoch": 
3.6972972972972973, + "grad_norm": 0.6693160533905029, + "learning_rate": 2.6180610606412587e-06, + "loss": 0.0168, + "step": 229 + }, + { + "epoch": 3.7135135135135133, + "grad_norm": 0.6764176487922668, + "learning_rate": 2.5885602079051354e-06, + "loss": 0.0128, + "step": 230 + }, + { + "epoch": 3.72972972972973, + "grad_norm": 0.5279078483581543, + "learning_rate": 2.559047002236995e-06, + "loss": 0.0099, + "step": 231 + }, + { + "epoch": 3.745945945945946, + "grad_norm": 0.6159639954566956, + "learning_rate": 2.529525560323462e-06, + "loss": 0.0122, + "step": 232 + }, + { + "epoch": 3.762162162162162, + "grad_norm": 0.558202862739563, + "learning_rate": 2.5e-06, + "loss": 0.0118, + "step": 233 + }, + { + "epoch": 3.7783783783783784, + "grad_norm": 0.37727731466293335, + "learning_rate": 2.470474439676539e-06, + "loss": 0.0099, + "step": 234 + }, + { + "epoch": 3.7945945945945945, + "grad_norm": 0.4426223039627075, + "learning_rate": 2.4409529977630052e-06, + "loss": 0.0104, + "step": 235 + }, + { + "epoch": 3.810810810810811, + "grad_norm": 0.48488032817840576, + "learning_rate": 2.411439792094866e-06, + "loss": 0.0145, + "step": 236 + }, + { + "epoch": 3.827027027027027, + "grad_norm": 0.4551326632499695, + "learning_rate": 2.381938939358742e-06, + "loss": 0.0107, + "step": 237 + }, + { + "epoch": 3.8432432432432435, + "grad_norm": 0.738146185874939, + "learning_rate": 2.3524545545181936e-06, + "loss": 0.0132, + "step": 238 + }, + { + "epoch": 3.8594594594594596, + "grad_norm": 0.5243131518363953, + "learning_rate": 2.322990750239733e-06, + "loss": 0.0093, + "step": 239 + }, + { + "epoch": 3.8756756756756756, + "grad_norm": 0.4127775728702545, + "learning_rate": 2.2935516363191695e-06, + "loss": 0.0131, + "step": 240 + }, + { + "epoch": 3.891891891891892, + "grad_norm": 0.5398023128509521, + "learning_rate": 2.2641413191083445e-06, + "loss": 0.0122, + "step": 241 + }, + { + "epoch": 3.908108108108108, + "grad_norm": 0.45481953024864197, + "learning_rate": 
2.234763900942355e-06, + "loss": 0.0106, + "step": 242 + }, + { + "epoch": 3.924324324324324, + "grad_norm": 0.5116259455680847, + "learning_rate": 2.2054234795673336e-06, + "loss": 0.017, + "step": 243 + }, + { + "epoch": 3.9405405405405407, + "grad_norm": 0.4914882779121399, + "learning_rate": 2.1761241475688697e-06, + "loss": 0.0108, + "step": 244 + } + ], + "logging_steps": 1, + "max_steps": 366, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 61, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.125638989643776e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-244/training_args.bin b/checkpoint-244/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..5b28d107f55169977eced33ac6929abb398bb2c5 --- /dev/null +++ b/checkpoint-244/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e2f1aaf0f48ae52048eea3703205522237e597bd418f53d57d152ef3ad9cbbc +size 8056 diff --git a/checkpoint-244/zero_to_fp32.py b/checkpoint-244/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/checkpoint-244/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . 
pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise 
FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # 
immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, 
zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + 
full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + 
wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # 
recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. 
+ + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. 
(one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/checkpoint-305/README.md b/checkpoint-305/README.md new file mode 100644 index 0000000000000000000000000000000000000000..be5c87703f12b547886cc6a2ecfbe9ee150496fa --- /dev/null +++ b/checkpoint-305/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Llama-3.1-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/checkpoint-305/adapter_config.json b/checkpoint-305/adapter_config.json new file mode 100644 index 
0000000000000000000000000000000000000000..aaa71b6240dcb4147fb982eb2f0ff89574c4fb31 --- /dev/null +++ b/checkpoint-305/adapter_config.json @@ -0,0 +1,40 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.1-8B-Instruct", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 512, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [ + "embed_tokens", + "lm_head" + ], + "peft_type": "LORA", + "r": 256, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "k_proj", + "up_proj", + "gate_proj", + "v_proj", + "down_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-305/adapter_model.safetensors b/checkpoint-305/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0d39b384938424edefd0d5e70d26dd0e4d6ebba9 --- /dev/null +++ b/checkpoint-305/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0fdbe6c42c89a54638ef3b592204ebb1c211f91400154c3e7ece073890dcad21 +size 3443586272 diff --git a/checkpoint-305/global_step303/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-305/global_step303/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..973046b377967e6aa988f7dc838341470d4df337 --- /dev/null +++ b/checkpoint-305/global_step303/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b089751069702b717b98745bf07581084964a53ded80bd4865baf78a99a6b9f5 +size 20661195036 diff --git 
a/checkpoint-305/global_step303/mp_rank_00_model_states.pt b/checkpoint-305/global_step303/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6c7699f44b235f0b982889cceb25f446e765880c --- /dev/null +++ b/checkpoint-305/global_step303/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:303bbca07a6111e21b474371585de6e97e927c1339a48573119eb70f517237e2 +size 3555326649 diff --git a/checkpoint-305/latest b/checkpoint-305/latest new file mode 100644 index 0000000000000000000000000000000000000000..4899493db914d3705774c031343988db41478e45 --- /dev/null +++ b/checkpoint-305/latest @@ -0,0 +1 @@ +global_step303 \ No newline at end of file diff --git a/checkpoint-305/rng_state.pth b/checkpoint-305/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..2965f4db4fc76ebc1e5aa7593a1ea8bcb9e95146 --- /dev/null +++ b/checkpoint-305/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79d546f5ff0177c685ad40e8587af458c727fcd376c89c674b6332d2cacceb8d +size 14244 diff --git a/checkpoint-305/scheduler.pt b/checkpoint-305/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..cb55b8722555fc58297e301ccfe3d7333197201d --- /dev/null +++ b/checkpoint-305/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69f200f4116ab889ce86a855037bd6927537569b4461543092ad34c77a5162c1 +size 1064 diff --git a/checkpoint-305/special_tokens_map.json b/checkpoint-305/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..278b7f0f84be865c4687700ee7b3c63d89a51e18 --- /dev/null +++ b/checkpoint-305/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-305/tokenizer.json b/checkpoint-305/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-305/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-305/tokenizer_config.json b/checkpoint-305/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ca91a2ef55f4239a7af81d7c9abb05f53621a07b --- /dev/null +++ b/checkpoint-305/tokenizer_config.json @@ -0,0 +1,2064 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": 
"<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": 
"<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": 
false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": 
false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": 
"<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": 
"<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + 
"extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|end_of_text|>", + "tokenizer_class": "PreTrainedTokenizer" +} diff --git a/checkpoint-305/trainer_state.json b/checkpoint-305/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..bdd5a57d7e33dd171eec88fb0540ffe11666a2ee --- /dev/null +++ b/checkpoint-305/trainer_state.json @@ -0,0 +1,2168 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.924324324324324, + "eval_steps": 500, + "global_step": 305, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.016216216216216217, + "grad_norm": 39.12052917480469, + "learning_rate": 5.0000000000000004e-08, + "loss": 2.2957, + "step": 1 + }, + { + "epoch": 0.032432432432432434, + "grad_norm": 38.9581413269043, + "learning_rate": 1.0000000000000001e-07, + "loss": 2.2959, + "step": 2 + }, + { + "epoch": 0.04864864864864865, + "grad_norm": 39.2702751159668, + "learning_rate": 1.5000000000000002e-07, + "loss": 2.2677, + "step": 3 + }, + { + "epoch": 0.06486486486486487, + "grad_norm": 39.18815231323242, + "learning_rate": 2.0000000000000002e-07, + "loss": 2.2936, + "step": 4 + }, + { + "epoch": 0.08108108108108109, + "grad_norm": 38.66701889038086, + "learning_rate": 2.5000000000000004e-07, + "loss": 2.2561, + "step": 5 + }, + { + "epoch": 0.0972972972972973, + "grad_norm": 39.53536605834961, + "learning_rate": 3.0000000000000004e-07, + "loss": 2.2579, + "step": 6 + }, + { + "epoch": 0.11351351351351352, + "grad_norm": 39.3793830871582, + "learning_rate": 3.5000000000000004e-07, + "loss": 2.2627, + "step": 7 + }, + { + "epoch": 0.12972972972972974, + "grad_norm": 39.88922119140625, + "learning_rate": 4.0000000000000003e-07, + "loss": 2.2729, + "step": 8 + }, + { + "epoch": 0.14594594594594595, + "grad_norm": 37.9880256652832, + "learning_rate": 
4.5000000000000003e-07, + "loss": 2.2311, + "step": 9 + }, + { + "epoch": 0.16216216216216217, + "grad_norm": 37.024139404296875, + "learning_rate": 5.000000000000001e-07, + "loss": 2.1773, + "step": 10 + }, + { + "epoch": 0.1783783783783784, + "grad_norm": 36.89325714111328, + "learning_rate": 5.5e-07, + "loss": 2.1927, + "step": 11 + }, + { + "epoch": 0.1945945945945946, + "grad_norm": 37.244178771972656, + "learning_rate": 6.000000000000001e-07, + "loss": 2.1757, + "step": 12 + }, + { + "epoch": 0.21081081081081082, + "grad_norm": 34.77650451660156, + "learning_rate": 6.5e-07, + "loss": 2.0392, + "step": 13 + }, + { + "epoch": 0.22702702702702704, + "grad_norm": 34.78818893432617, + "learning_rate": 7.000000000000001e-07, + "loss": 1.9996, + "step": 14 + }, + { + "epoch": 0.24324324324324326, + "grad_norm": 34.86852264404297, + "learning_rate": 7.5e-07, + "loss": 1.9496, + "step": 15 + }, + { + "epoch": 0.2594594594594595, + "grad_norm": 35.202796936035156, + "learning_rate": 8.000000000000001e-07, + "loss": 1.8542, + "step": 16 + }, + { + "epoch": 0.2756756756756757, + "grad_norm": 34.11354064941406, + "learning_rate": 8.500000000000001e-07, + "loss": 1.7118, + "step": 17 + }, + { + "epoch": 0.2918918918918919, + "grad_norm": 36.309059143066406, + "learning_rate": 9.000000000000001e-07, + "loss": 1.6834, + "step": 18 + }, + { + "epoch": 0.3081081081081081, + "grad_norm": 34.69994354248047, + "learning_rate": 9.500000000000001e-07, + "loss": 1.5298, + "step": 19 + }, + { + "epoch": 0.32432432432432434, + "grad_norm": 35.43153381347656, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.4191, + "step": 20 + }, + { + "epoch": 0.34054054054054056, + "grad_norm": 33.53745651245117, + "learning_rate": 1.0500000000000001e-06, + "loss": 1.3068, + "step": 21 + }, + { + "epoch": 0.3567567567567568, + "grad_norm": 33.775604248046875, + "learning_rate": 1.1e-06, + "loss": 1.224, + "step": 22 + }, + { + "epoch": 0.372972972972973, + "grad_norm": 30.57005500793457, + 
"learning_rate": 1.1500000000000002e-06, + "loss": 1.0704, + "step": 23 + }, + { + "epoch": 0.3891891891891892, + "grad_norm": 27.964860916137695, + "learning_rate": 1.2000000000000002e-06, + "loss": 0.9548, + "step": 24 + }, + { + "epoch": 0.40540540540540543, + "grad_norm": 26.023576736450195, + "learning_rate": 1.25e-06, + "loss": 0.8503, + "step": 25 + }, + { + "epoch": 0.42162162162162165, + "grad_norm": 25.0452938079834, + "learning_rate": 1.3e-06, + "loss": 0.6938, + "step": 26 + }, + { + "epoch": 0.43783783783783786, + "grad_norm": 24.663373947143555, + "learning_rate": 1.3500000000000002e-06, + "loss": 0.5648, + "step": 27 + }, + { + "epoch": 0.4540540540540541, + "grad_norm": 21.61736488342285, + "learning_rate": 1.4000000000000001e-06, + "loss": 0.435, + "step": 28 + }, + { + "epoch": 0.4702702702702703, + "grad_norm": 18.3259334564209, + "learning_rate": 1.45e-06, + "loss": 0.3322, + "step": 29 + }, + { + "epoch": 0.4864864864864865, + "grad_norm": 16.80081558227539, + "learning_rate": 1.5e-06, + "loss": 0.2625, + "step": 30 + }, + { + "epoch": 0.5027027027027027, + "grad_norm": 14.789258003234863, + "learning_rate": 1.5500000000000002e-06, + "loss": 0.1757, + "step": 31 + }, + { + "epoch": 0.518918918918919, + "grad_norm": 10.406538963317871, + "learning_rate": 1.6000000000000001e-06, + "loss": 0.1376, + "step": 32 + }, + { + "epoch": 0.5351351351351351, + "grad_norm": 4.868802547454834, + "learning_rate": 1.6500000000000003e-06, + "loss": 0.0815, + "step": 33 + }, + { + "epoch": 0.5513513513513514, + "grad_norm": 1.8639686107635498, + "learning_rate": 1.7000000000000002e-06, + "loss": 0.0628, + "step": 34 + }, + { + "epoch": 0.5675675675675675, + "grad_norm": 1.897918462753296, + "learning_rate": 1.75e-06, + "loss": 0.0775, + "step": 35 + }, + { + "epoch": 0.5837837837837838, + "grad_norm": 1.296712040901184, + "learning_rate": 1.8000000000000001e-06, + "loss": 0.0565, + "step": 36 + }, + { + "epoch": 0.6, + "grad_norm": 1.0163214206695557, + 
"learning_rate": 1.85e-06, + "loss": 0.0544, + "step": 37 + }, + { + "epoch": 0.6162162162162163, + "grad_norm": 1.070162296295166, + "learning_rate": 1.9000000000000002e-06, + "loss": 0.0621, + "step": 38 + }, + { + "epoch": 0.6324324324324324, + "grad_norm": 1.024267315864563, + "learning_rate": 1.9500000000000004e-06, + "loss": 0.0566, + "step": 39 + }, + { + "epoch": 0.6486486486486487, + "grad_norm": 0.9016611576080322, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.0511, + "step": 40 + }, + { + "epoch": 0.6648648648648648, + "grad_norm": 0.8272562623023987, + "learning_rate": 2.05e-06, + "loss": 0.0533, + "step": 41 + }, + { + "epoch": 0.6810810810810811, + "grad_norm": 0.8875278234481812, + "learning_rate": 2.1000000000000002e-06, + "loss": 0.0495, + "step": 42 + }, + { + "epoch": 0.6972972972972973, + "grad_norm": 0.8804877996444702, + "learning_rate": 2.15e-06, + "loss": 0.0506, + "step": 43 + }, + { + "epoch": 0.7135135135135136, + "grad_norm": 0.7133358120918274, + "learning_rate": 2.2e-06, + "loss": 0.0467, + "step": 44 + }, + { + "epoch": 0.7297297297297297, + "grad_norm": 0.8142214417457581, + "learning_rate": 2.25e-06, + "loss": 0.0552, + "step": 45 + }, + { + "epoch": 0.745945945945946, + "grad_norm": 0.8341564536094666, + "learning_rate": 2.3000000000000004e-06, + "loss": 0.0574, + "step": 46 + }, + { + "epoch": 0.7621621621621621, + "grad_norm": 0.6500507593154907, + "learning_rate": 2.35e-06, + "loss": 0.0398, + "step": 47 + }, + { + "epoch": 0.7783783783783784, + "grad_norm": 0.6163598895072937, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.0459, + "step": 48 + }, + { + "epoch": 0.7945945945945946, + "grad_norm": 0.663949191570282, + "learning_rate": 2.4500000000000003e-06, + "loss": 0.046, + "step": 49 + }, + { + "epoch": 0.8108108108108109, + "grad_norm": 0.7521553635597229, + "learning_rate": 2.5e-06, + "loss": 0.0525, + "step": 50 + }, + { + "epoch": 0.827027027027027, + "grad_norm": 0.7828383445739746, + "learning_rate": 
2.55e-06, + "loss": 0.0558, + "step": 51 + }, + { + "epoch": 0.8432432432432433, + "grad_norm": 0.7935078740119934, + "learning_rate": 2.6e-06, + "loss": 0.0451, + "step": 52 + }, + { + "epoch": 0.8594594594594595, + "grad_norm": 0.6327880620956421, + "learning_rate": 2.6500000000000005e-06, + "loss": 0.0403, + "step": 53 + }, + { + "epoch": 0.8756756756756757, + "grad_norm": 0.6185981035232544, + "learning_rate": 2.7000000000000004e-06, + "loss": 0.0406, + "step": 54 + }, + { + "epoch": 0.8918918918918919, + "grad_norm": 0.5417979955673218, + "learning_rate": 2.7500000000000004e-06, + "loss": 0.0426, + "step": 55 + }, + { + "epoch": 0.9081081081081082, + "grad_norm": 0.7140630483627319, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.0446, + "step": 56 + }, + { + "epoch": 0.9243243243243243, + "grad_norm": 0.7191944122314453, + "learning_rate": 2.85e-06, + "loss": 0.047, + "step": 57 + }, + { + "epoch": 0.9405405405405406, + "grad_norm": 0.7562940716743469, + "learning_rate": 2.9e-06, + "loss": 0.0476, + "step": 58 + }, + { + "epoch": 0.9567567567567568, + "grad_norm": 0.7422239184379578, + "learning_rate": 2.95e-06, + "loss": 0.0462, + "step": 59 + }, + { + "epoch": 0.972972972972973, + "grad_norm": 0.677144467830658, + "learning_rate": 3e-06, + "loss": 0.0475, + "step": 60 + }, + { + "epoch": 0.9891891891891892, + "grad_norm": 0.6127192974090576, + "learning_rate": 3.05e-06, + "loss": 0.0434, + "step": 61 + }, + { + "epoch": 1.0, + "grad_norm": 0.6127192974090576, + "learning_rate": 3.1000000000000004e-06, + "loss": 0.0375, + "step": 62 + }, + { + "epoch": 1.0162162162162163, + "grad_norm": 0.959559440612793, + "learning_rate": 3.1500000000000003e-06, + "loss": 0.0421, + "step": 63 + }, + { + "epoch": 1.0324324324324325, + "grad_norm": 0.6539880037307739, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.0414, + "step": 64 + }, + { + "epoch": 1.0486486486486486, + "grad_norm": 0.5929313898086548, + "learning_rate": 3.2500000000000002e-06, + "loss": 
0.0451, + "step": 65 + }, + { + "epoch": 1.0648648648648649, + "grad_norm": 0.6479571461677551, + "learning_rate": 3.3000000000000006e-06, + "loss": 0.0415, + "step": 66 + }, + { + "epoch": 1.0810810810810811, + "grad_norm": 0.5496926307678223, + "learning_rate": 3.3500000000000005e-06, + "loss": 0.0366, + "step": 67 + }, + { + "epoch": 1.0972972972972972, + "grad_norm": 0.5373682379722595, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.0383, + "step": 68 + }, + { + "epoch": 1.1135135135135135, + "grad_norm": 0.5489712357521057, + "learning_rate": 3.45e-06, + "loss": 0.0427, + "step": 69 + }, + { + "epoch": 1.1297297297297297, + "grad_norm": 0.6830047369003296, + "learning_rate": 3.5e-06, + "loss": 0.039, + "step": 70 + }, + { + "epoch": 1.145945945945946, + "grad_norm": 0.5794199705123901, + "learning_rate": 3.5500000000000003e-06, + "loss": 0.0409, + "step": 71 + }, + { + "epoch": 1.1621621621621623, + "grad_norm": 0.571513831615448, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.0392, + "step": 72 + }, + { + "epoch": 1.1783783783783783, + "grad_norm": 0.7753933668136597, + "learning_rate": 3.65e-06, + "loss": 0.0365, + "step": 73 + }, + { + "epoch": 1.1945945945945946, + "grad_norm": 0.6135310530662537, + "learning_rate": 3.7e-06, + "loss": 0.036, + "step": 74 + }, + { + "epoch": 1.2108108108108109, + "grad_norm": 0.5497344136238098, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.035, + "step": 75 + }, + { + "epoch": 1.227027027027027, + "grad_norm": 0.5861782431602478, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.0434, + "step": 76 + }, + { + "epoch": 1.2432432432432432, + "grad_norm": 0.6941010355949402, + "learning_rate": 3.85e-06, + "loss": 0.0336, + "step": 77 + }, + { + "epoch": 1.2594594594594595, + "grad_norm": 0.5305830240249634, + "learning_rate": 3.900000000000001e-06, + "loss": 0.0391, + "step": 78 + }, + { + "epoch": 1.2756756756756757, + "grad_norm": 0.6456385254859924, + "learning_rate": 3.95e-06, + "loss": 
0.0422, + "step": 79 + }, + { + "epoch": 1.291891891891892, + "grad_norm": 0.5704363584518433, + "learning_rate": 4.000000000000001e-06, + "loss": 0.0342, + "step": 80 + }, + { + "epoch": 1.308108108108108, + "grad_norm": 0.5257390141487122, + "learning_rate": 4.05e-06, + "loss": 0.0369, + "step": 81 + }, + { + "epoch": 1.3243243243243243, + "grad_norm": 0.5541989207267761, + "learning_rate": 4.1e-06, + "loss": 0.0331, + "step": 82 + }, + { + "epoch": 1.3405405405405406, + "grad_norm": 0.7190688252449036, + "learning_rate": 4.15e-06, + "loss": 0.039, + "step": 83 + }, + { + "epoch": 1.3567567567567567, + "grad_norm": 0.4766721725463867, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.0354, + "step": 84 + }, + { + "epoch": 1.372972972972973, + "grad_norm": 0.5847981572151184, + "learning_rate": 4.25e-06, + "loss": 0.0355, + "step": 85 + }, + { + "epoch": 1.3891891891891892, + "grad_norm": 0.6361181139945984, + "learning_rate": 4.3e-06, + "loss": 0.0415, + "step": 86 + }, + { + "epoch": 1.4054054054054055, + "grad_norm": 0.6437036395072937, + "learning_rate": 4.350000000000001e-06, + "loss": 0.0353, + "step": 87 + }, + { + "epoch": 1.4216216216216218, + "grad_norm": 0.712043046951294, + "learning_rate": 4.4e-06, + "loss": 0.0311, + "step": 88 + }, + { + "epoch": 1.4378378378378378, + "grad_norm": 0.5829771757125854, + "learning_rate": 4.450000000000001e-06, + "loss": 0.0433, + "step": 89 + }, + { + "epoch": 1.454054054054054, + "grad_norm": 0.6977937817573547, + "learning_rate": 4.5e-06, + "loss": 0.0391, + "step": 90 + }, + { + "epoch": 1.4702702702702704, + "grad_norm": 0.49931228160858154, + "learning_rate": 4.5500000000000005e-06, + "loss": 0.0352, + "step": 91 + }, + { + "epoch": 1.4864864864864864, + "grad_norm": 0.5281490683555603, + "learning_rate": 4.600000000000001e-06, + "loss": 0.0385, + "step": 92 + }, + { + "epoch": 1.5027027027027027, + "grad_norm": 0.613349974155426, + "learning_rate": 4.65e-06, + "loss": 0.0399, + "step": 93 + }, + { + 
"epoch": 1.518918918918919, + "grad_norm": 0.6584879755973816, + "learning_rate": 4.7e-06, + "loss": 0.043, + "step": 94 + }, + { + "epoch": 1.535135135135135, + "grad_norm": 0.6006895303726196, + "learning_rate": 4.75e-06, + "loss": 0.0372, + "step": 95 + }, + { + "epoch": 1.5513513513513515, + "grad_norm": 0.5364943146705627, + "learning_rate": 4.800000000000001e-06, + "loss": 0.0384, + "step": 96 + }, + { + "epoch": 1.5675675675675675, + "grad_norm": 0.4963968098163605, + "learning_rate": 4.85e-06, + "loss": 0.0324, + "step": 97 + }, + { + "epoch": 1.5837837837837838, + "grad_norm": 0.5868538618087769, + "learning_rate": 4.9000000000000005e-06, + "loss": 0.0386, + "step": 98 + }, + { + "epoch": 1.6, + "grad_norm": 0.6690974235534668, + "learning_rate": 4.95e-06, + "loss": 0.0332, + "step": 99 + }, + { + "epoch": 1.6162162162162161, + "grad_norm": 0.6118388175964355, + "learning_rate": 5e-06, + "loss": 0.0398, + "step": 100 + }, + { + "epoch": 1.6324324324324324, + "grad_norm": 0.6872304677963257, + "learning_rate": 4.999825642177387e-06, + "loss": 0.0333, + "step": 101 + }, + { + "epoch": 1.6486486486486487, + "grad_norm": 0.6457200646400452, + "learning_rate": 4.999302593030069e-06, + "loss": 0.0381, + "step": 102 + }, + { + "epoch": 1.6648648648648647, + "grad_norm": 0.6096416115760803, + "learning_rate": 4.998430925516213e-06, + "loss": 0.0385, + "step": 103 + }, + { + "epoch": 1.6810810810810812, + "grad_norm": 0.582796573638916, + "learning_rate": 4.99721076122146e-06, + "loss": 0.0317, + "step": 104 + }, + { + "epoch": 1.6972972972972973, + "grad_norm": 0.5576394200325012, + "learning_rate": 4.995642270341961e-06, + "loss": 0.0378, + "step": 105 + }, + { + "epoch": 1.7135135135135136, + "grad_norm": 0.7414760589599609, + "learning_rate": 4.99372567166064e-06, + "loss": 0.0403, + "step": 106 + }, + { + "epoch": 1.7297297297297298, + "grad_norm": 0.6029103994369507, + "learning_rate": 4.991461232516675e-06, + "loss": 0.0418, + "step": 107 + }, + { + "epoch": 
1.7459459459459459, + "grad_norm": 0.771609365940094, + "learning_rate": 4.98884926876821e-06, + "loss": 0.0413, + "step": 108 + }, + { + "epoch": 1.7621621621621621, + "grad_norm": 0.6869891285896301, + "learning_rate": 4.9858901447482924e-06, + "loss": 0.0367, + "step": 109 + }, + { + "epoch": 1.7783783783783784, + "grad_norm": 0.4931647479534149, + "learning_rate": 4.982584273214061e-06, + "loss": 0.033, + "step": 110 + }, + { + "epoch": 1.7945945945945945, + "grad_norm": 0.5160052180290222, + "learning_rate": 4.978932115289165e-06, + "loss": 0.0357, + "step": 111 + }, + { + "epoch": 1.810810810810811, + "grad_norm": 0.49750861525535583, + "learning_rate": 4.974934180399447e-06, + "loss": 0.0333, + "step": 112 + }, + { + "epoch": 1.827027027027027, + "grad_norm": 0.6596441864967346, + "learning_rate": 4.970591026201884e-06, + "loss": 0.0354, + "step": 113 + }, + { + "epoch": 1.8432432432432433, + "grad_norm": 0.6613579988479614, + "learning_rate": 4.965903258506806e-06, + "loss": 0.0377, + "step": 114 + }, + { + "epoch": 1.8594594594594596, + "grad_norm": 0.5383866429328918, + "learning_rate": 4.9608715311933865e-06, + "loss": 0.0418, + "step": 115 + }, + { + "epoch": 1.8756756756756756, + "grad_norm": 0.6303413510322571, + "learning_rate": 4.955496546118439e-06, + "loss": 0.0351, + "step": 116 + }, + { + "epoch": 1.8918918918918919, + "grad_norm": 0.5293605923652649, + "learning_rate": 4.949779053018519e-06, + "loss": 0.0322, + "step": 117 + }, + { + "epoch": 1.9081081081081082, + "grad_norm": 0.5211143493652344, + "learning_rate": 4.943719849405347e-06, + "loss": 0.0374, + "step": 118 + }, + { + "epoch": 1.9243243243243242, + "grad_norm": 0.5933778882026672, + "learning_rate": 4.937319780454559e-06, + "loss": 0.0377, + "step": 119 + }, + { + "epoch": 1.9405405405405407, + "grad_norm": 0.6020687818527222, + "learning_rate": 4.930579738887827e-06, + "loss": 0.0313, + "step": 120 + }, + { + "epoch": 1.9567567567567568, + "grad_norm": 0.7828154563903809, + 
"learning_rate": 4.923500664848327e-06, + "loss": 0.0372, + "step": 121 + }, + { + "epoch": 1.972972972972973, + "grad_norm": 0.6172424554824829, + "learning_rate": 4.9160835457696075e-06, + "loss": 0.0387, + "step": 122 + }, + { + "epoch": 1.9891891891891893, + "grad_norm": 0.5671921372413635, + "learning_rate": 4.9083294162378545e-06, + "loss": 0.0346, + "step": 123 + }, + { + "epoch": 2.0, + "grad_norm": 1.0704405307769775, + "learning_rate": 4.900239357847582e-06, + "loss": 0.0298, + "step": 124 + }, + { + "epoch": 2.016216216216216, + "grad_norm": 0.5932011604309082, + "learning_rate": 4.891814499050762e-06, + "loss": 0.0243, + "step": 125 + }, + { + "epoch": 2.0324324324324325, + "grad_norm": 0.47397834062576294, + "learning_rate": 4.883056014999423e-06, + "loss": 0.0281, + "step": 126 + }, + { + "epoch": 2.0486486486486486, + "grad_norm": 0.538270115852356, + "learning_rate": 4.873965127381734e-06, + "loss": 0.0268, + "step": 127 + }, + { + "epoch": 2.064864864864865, + "grad_norm": 0.3924686908721924, + "learning_rate": 4.864543104251587e-06, + "loss": 0.02, + "step": 128 + }, + { + "epoch": 2.081081081081081, + "grad_norm": 0.5162842869758606, + "learning_rate": 4.854791259851735e-06, + "loss": 0.0237, + "step": 129 + }, + { + "epoch": 2.097297297297297, + "grad_norm": 0.4691126048564911, + "learning_rate": 4.844710954430464e-06, + "loss": 0.0224, + "step": 130 + }, + { + "epoch": 2.1135135135135137, + "grad_norm": 0.47650063037872314, + "learning_rate": 4.834303594051854e-06, + "loss": 0.0202, + "step": 131 + }, + { + "epoch": 2.1297297297297297, + "grad_norm": 0.5041627883911133, + "learning_rate": 4.823570630399665e-06, + "loss": 0.0228, + "step": 132 + }, + { + "epoch": 2.145945945945946, + "grad_norm": 0.5368483662605286, + "learning_rate": 4.812513560574832e-06, + "loss": 0.0241, + "step": 133 + }, + { + "epoch": 2.1621621621621623, + "grad_norm": 0.7245975732803345, + "learning_rate": 4.8011339268866505e-06, + "loss": 0.0294, + "step": 134 + }, + { 
+ "epoch": 2.1783783783783783, + "grad_norm": 0.5506283044815063, + "learning_rate": 4.789433316637644e-06, + "loss": 0.0204, + "step": 135 + }, + { + "epoch": 2.1945945945945944, + "grad_norm": 0.5240617990493774, + "learning_rate": 4.777413361902152e-06, + "loss": 0.0227, + "step": 136 + }, + { + "epoch": 2.210810810810811, + "grad_norm": 0.6164438128471375, + "learning_rate": 4.765075739298683e-06, + "loss": 0.0209, + "step": 137 + }, + { + "epoch": 2.227027027027027, + "grad_norm": 0.551898181438446, + "learning_rate": 4.752422169756048e-06, + "loss": 0.0187, + "step": 138 + }, + { + "epoch": 2.2432432432432434, + "grad_norm": 0.45092299580574036, + "learning_rate": 4.739454418273314e-06, + "loss": 0.0281, + "step": 139 + }, + { + "epoch": 2.2594594594594595, + "grad_norm": 0.48173126578330994, + "learning_rate": 4.726174293673612e-06, + "loss": 0.0213, + "step": 140 + }, + { + "epoch": 2.2756756756756755, + "grad_norm": 0.48536229133605957, + "learning_rate": 4.712583648351827e-06, + "loss": 0.0204, + "step": 141 + }, + { + "epoch": 2.291891891891892, + "grad_norm": 0.4885499179363251, + "learning_rate": 4.698684378016223e-06, + "loss": 0.0225, + "step": 142 + }, + { + "epoch": 2.308108108108108, + "grad_norm": 0.40719687938690186, + "learning_rate": 4.684478421424007e-06, + "loss": 0.0205, + "step": 143 + }, + { + "epoch": 2.3243243243243246, + "grad_norm": 0.4365272521972656, + "learning_rate": 4.669967760110908e-06, + "loss": 0.0224, + "step": 144 + }, + { + "epoch": 2.3405405405405406, + "grad_norm": 0.4639301300048828, + "learning_rate": 4.655154418114774e-06, + "loss": 0.0256, + "step": 145 + }, + { + "epoch": 2.3567567567567567, + "grad_norm": 0.47420835494995117, + "learning_rate": 4.6400404616932505e-06, + "loss": 0.0208, + "step": 146 + }, + { + "epoch": 2.372972972972973, + "grad_norm": 0.5030474066734314, + "learning_rate": 4.624627999035564e-06, + "loss": 0.0255, + "step": 147 + }, + { + "epoch": 2.389189189189189, + "grad_norm": 
0.47888803482055664, + "learning_rate": 4.608919179968457e-06, + "loss": 0.0241, + "step": 148 + }, + { + "epoch": 2.4054054054054053, + "grad_norm": 0.602581262588501, + "learning_rate": 4.592916195656322e-06, + "loss": 0.0243, + "step": 149 + }, + { + "epoch": 2.4216216216216218, + "grad_norm": 0.6816417574882507, + "learning_rate": 4.576621278295558e-06, + "loss": 0.0259, + "step": 150 + }, + { + "epoch": 2.437837837837838, + "grad_norm": 0.6839447617530823, + "learning_rate": 4.5600367008032135e-06, + "loss": 0.0247, + "step": 151 + }, + { + "epoch": 2.454054054054054, + "grad_norm": 0.496794193983078, + "learning_rate": 4.543164776499945e-06, + "loss": 0.0244, + "step": 152 + }, + { + "epoch": 2.4702702702702704, + "grad_norm": 0.4372956156730652, + "learning_rate": 4.5260078587873416e-06, + "loss": 0.0208, + "step": 153 + }, + { + "epoch": 2.4864864864864864, + "grad_norm": 0.6199434399604797, + "learning_rate": 4.508568340819654e-06, + "loss": 0.028, + "step": 154 + }, + { + "epoch": 2.5027027027027025, + "grad_norm": 0.6074104905128479, + "learning_rate": 4.490848655169986e-06, + "loss": 0.0278, + "step": 155 + }, + { + "epoch": 2.518918918918919, + "grad_norm": 0.5419324636459351, + "learning_rate": 4.472851273490985e-06, + "loss": 0.0181, + "step": 156 + }, + { + "epoch": 2.535135135135135, + "grad_norm": 0.4877943992614746, + "learning_rate": 4.454578706170075e-06, + "loss": 0.0214, + "step": 157 + }, + { + "epoch": 2.5513513513513515, + "grad_norm": 0.5049244165420532, + "learning_rate": 4.436033501979299e-06, + "loss": 0.0214, + "step": 158 + }, + { + "epoch": 2.5675675675675675, + "grad_norm": 0.45928511023521423, + "learning_rate": 4.417218247719794e-06, + "loss": 0.0167, + "step": 159 + }, + { + "epoch": 2.583783783783784, + "grad_norm": 0.5185860395431519, + "learning_rate": 4.398135567860972e-06, + "loss": 0.0243, + "step": 160 + }, + { + "epoch": 2.6, + "grad_norm": 0.3984812796115875, + "learning_rate": 4.378788124174441e-06, + "loss": 0.0201, + 
"step": 161 + }, + { + "epoch": 2.616216216216216, + "grad_norm": 0.607692301273346, + "learning_rate": 4.359178615362725e-06, + "loss": 0.0247, + "step": 162 + }, + { + "epoch": 2.6324324324324326, + "grad_norm": 0.5436367988586426, + "learning_rate": 4.33930977668283e-06, + "loss": 0.0204, + "step": 163 + }, + { + "epoch": 2.6486486486486487, + "grad_norm": 0.6367728114128113, + "learning_rate": 4.319184379564716e-06, + "loss": 0.0222, + "step": 164 + }, + { + "epoch": 2.6648648648648647, + "grad_norm": 0.5538708567619324, + "learning_rate": 4.298805231224721e-06, + "loss": 0.0215, + "step": 165 + }, + { + "epoch": 2.6810810810810812, + "grad_norm": 0.5421778559684753, + "learning_rate": 4.278175174273989e-06, + "loss": 0.0201, + "step": 166 + }, + { + "epoch": 2.6972972972972973, + "grad_norm": 0.6123104691505432, + "learning_rate": 4.257297086321967e-06, + "loss": 0.0209, + "step": 167 + }, + { + "epoch": 2.7135135135135133, + "grad_norm": 0.6386194229125977, + "learning_rate": 4.236173879575022e-06, + "loss": 0.0214, + "step": 168 + }, + { + "epoch": 2.72972972972973, + "grad_norm": 0.6019391417503357, + "learning_rate": 4.2148085004302205e-06, + "loss": 0.0246, + "step": 169 + }, + { + "epoch": 2.745945945945946, + "grad_norm": 0.5638225674629211, + "learning_rate": 4.1932039290643534e-06, + "loss": 0.0189, + "step": 170 + }, + { + "epoch": 2.762162162162162, + "grad_norm": 0.6640142202377319, + "learning_rate": 4.1713631790182366e-06, + "loss": 0.0236, + "step": 171 + }, + { + "epoch": 2.7783783783783784, + "grad_norm": 0.5170625448226929, + "learning_rate": 4.149289296776369e-06, + "loss": 0.0203, + "step": 172 + }, + { + "epoch": 2.7945945945945945, + "grad_norm": 0.5130777955055237, + "learning_rate": 4.126985361341984e-06, + "loss": 0.0195, + "step": 173 + }, + { + "epoch": 2.810810810810811, + "grad_norm": 0.5125660300254822, + "learning_rate": 4.104454483807579e-06, + "loss": 0.0229, + "step": 174 + }, + { + "epoch": 2.827027027027027, + "grad_norm": 
0.573662281036377, + "learning_rate": 4.0816998069209516e-06, + "loss": 0.0202, + "step": 175 + }, + { + "epoch": 2.8432432432432435, + "grad_norm": 0.6013869643211365, + "learning_rate": 4.058724504646834e-06, + "loss": 0.0319, + "step": 176 + }, + { + "epoch": 2.8594594594594596, + "grad_norm": 0.5050269365310669, + "learning_rate": 4.0355317817241705e-06, + "loss": 0.0189, + "step": 177 + }, + { + "epoch": 2.8756756756756756, + "grad_norm": 0.5249143838882446, + "learning_rate": 4.012124873219094e-06, + "loss": 0.0214, + "step": 178 + }, + { + "epoch": 2.891891891891892, + "grad_norm": 0.5124053955078125, + "learning_rate": 3.988507044073687e-06, + "loss": 0.0162, + "step": 179 + }, + { + "epoch": 2.908108108108108, + "grad_norm": 0.4640377461910248, + "learning_rate": 3.964681588650562e-06, + "loss": 0.0189, + "step": 180 + }, + { + "epoch": 2.924324324324324, + "grad_norm": 0.6197820901870728, + "learning_rate": 3.940651830273342e-06, + "loss": 0.0237, + "step": 181 + }, + { + "epoch": 2.9405405405405407, + "grad_norm": 0.6041496992111206, + "learning_rate": 3.916421120763106e-06, + "loss": 0.0241, + "step": 182 + }, + { + "epoch": 2.9567567567567568, + "grad_norm": 0.5259250402450562, + "learning_rate": 3.891992839970855e-06, + "loss": 0.0207, + "step": 183 + }, + { + "epoch": 2.972972972972973, + "grad_norm": 0.6110473871231079, + "learning_rate": 3.8673703953060685e-06, + "loss": 0.0199, + "step": 184 + }, + { + "epoch": 2.9891891891891893, + "grad_norm": 0.504909098148346, + "learning_rate": 3.8425572212614155e-06, + "loss": 0.0211, + "step": 185 + }, + { + "epoch": 3.0, + "grad_norm": 1.010295033454895, + "learning_rate": 3.817556778933697e-06, + "loss": 0.0195, + "step": 186 + }, + { + "epoch": 3.016216216216216, + "grad_norm": 0.3988107442855835, + "learning_rate": 3.792372555541064e-06, + "loss": 0.0131, + "step": 187 + }, + { + "epoch": 3.0324324324324325, + "grad_norm": 0.31533047556877136, + "learning_rate": 3.7670080639366e-06, + "loss": 0.0107, + 
"step": 188 + }, + { + "epoch": 3.0486486486486486, + "grad_norm": 0.4819331765174866, + "learning_rate": 3.741466842118327e-06, + "loss": 0.0135, + "step": 189 + }, + { + "epoch": 3.064864864864865, + "grad_norm": 0.33277931809425354, + "learning_rate": 3.7157524527357036e-06, + "loss": 0.0112, + "step": 190 + }, + { + "epoch": 3.081081081081081, + "grad_norm": 0.3936960697174072, + "learning_rate": 3.6898684825926845e-06, + "loss": 0.0125, + "step": 191 + }, + { + "epoch": 3.097297297297297, + "grad_norm": 0.46424582600593567, + "learning_rate": 3.663818542147409e-06, + "loss": 0.0171, + "step": 192 + }, + { + "epoch": 3.1135135135135137, + "grad_norm": 0.4771481156349182, + "learning_rate": 3.6376062650085918e-06, + "loss": 0.0121, + "step": 193 + }, + { + "epoch": 3.1297297297297297, + "grad_norm": 0.4661300778388977, + "learning_rate": 3.61123530742869e-06, + "loss": 0.0164, + "step": 194 + }, + { + "epoch": 3.145945945945946, + "grad_norm": 0.424891859292984, + "learning_rate": 3.5847093477938955e-06, + "loss": 0.0116, + "step": 195 + }, + { + "epoch": 3.1621621621621623, + "grad_norm": 0.4026256501674652, + "learning_rate": 3.5580320861110627e-06, + "loss": 0.0135, + "step": 196 + }, + { + "epoch": 3.1783783783783783, + "grad_norm": 0.4946088492870331, + "learning_rate": 3.5312072434915983e-06, + "loss": 0.0104, + "step": 197 + }, + { + "epoch": 3.1945945945945944, + "grad_norm": 0.3920349180698395, + "learning_rate": 3.5042385616324243e-06, + "loss": 0.0127, + "step": 198 + }, + { + "epoch": 3.210810810810811, + "grad_norm": 0.35541996359825134, + "learning_rate": 3.477129802294057e-06, + "loss": 0.0095, + "step": 199 + }, + { + "epoch": 3.227027027027027, + "grad_norm": 0.4382397532463074, + "learning_rate": 3.4498847467759e-06, + "loss": 0.009, + "step": 200 + }, + { + "epoch": 3.2432432432432434, + "grad_norm": 0.4411066174507141, + "learning_rate": 3.4225071953887977e-06, + "loss": 0.0103, + "step": 201 + }, + { + "epoch": 3.2594594594594595, + 
"grad_norm": 0.4458081126213074, + "learning_rate": 3.3950009669249502e-06, + "loss": 0.011, + "step": 202 + }, + { + "epoch": 3.2756756756756755, + "grad_norm": 0.6307731866836548, + "learning_rate": 3.3673698981252385e-06, + "loss": 0.0123, + "step": 203 + }, + { + "epoch": 3.291891891891892, + "grad_norm": 0.48209428787231445, + "learning_rate": 3.3396178431440572e-06, + "loss": 0.0097, + "step": 204 + }, + { + "epoch": 3.308108108108108, + "grad_norm": 0.5341079831123352, + "learning_rate": 3.3117486730117092e-06, + "loss": 0.0098, + "step": 205 + }, + { + "epoch": 3.3243243243243246, + "grad_norm": 0.6222304701805115, + "learning_rate": 3.283766275094454e-06, + "loss": 0.0153, + "step": 206 + }, + { + "epoch": 3.3405405405405406, + "grad_norm": 0.6000131964683533, + "learning_rate": 3.255674552552267e-06, + "loss": 0.0124, + "step": 207 + }, + { + "epoch": 3.3567567567567567, + "grad_norm": 0.573101818561554, + "learning_rate": 3.227477423794412e-06, + "loss": 0.0111, + "step": 208 + }, + { + "epoch": 3.372972972972973, + "grad_norm": 0.49702250957489014, + "learning_rate": 3.1991788219328657e-06, + "loss": 0.0125, + "step": 209 + }, + { + "epoch": 3.389189189189189, + "grad_norm": 0.4762848913669586, + "learning_rate": 3.1707826942337124e-06, + "loss": 0.0089, + "step": 210 + }, + { + "epoch": 3.4054054054054053, + "grad_norm": 0.671346127986908, + "learning_rate": 3.142293001566548e-06, + "loss": 0.0128, + "step": 211 + }, + { + "epoch": 3.4216216216216218, + "grad_norm": 0.5226602554321289, + "learning_rate": 3.1137137178519983e-06, + "loss": 0.0119, + "step": 212 + }, + { + "epoch": 3.437837837837838, + "grad_norm": 0.40818020701408386, + "learning_rate": 3.085048829507406e-06, + "loss": 0.0097, + "step": 213 + }, + { + "epoch": 3.454054054054054, + "grad_norm": 0.5493360161781311, + "learning_rate": 3.056302334890786e-06, + "loss": 0.0117, + "step": 214 + }, + { + "epoch": 3.4702702702702704, + "grad_norm": 0.4277520477771759, + "learning_rate": 
3.027478243743106e-06, + "loss": 0.0115, + "step": 215 + }, + { + "epoch": 3.4864864864864864, + "grad_norm": 0.5382378697395325, + "learning_rate": 2.9985805766289815e-06, + "loss": 0.0141, + "step": 216 + }, + { + "epoch": 3.5027027027027025, + "grad_norm": 0.686087429523468, + "learning_rate": 2.9696133643758663e-06, + "loss": 0.0106, + "step": 217 + }, + { + "epoch": 3.518918918918919, + "grad_norm": 0.4279845356941223, + "learning_rate": 2.940580647511805e-06, + "loss": 0.0101, + "step": 218 + }, + { + "epoch": 3.535135135135135, + "grad_norm": 0.3966323435306549, + "learning_rate": 2.911486475701835e-06, + "loss": 0.0111, + "step": 219 + }, + { + "epoch": 3.5513513513513515, + "grad_norm": 0.4444655776023865, + "learning_rate": 2.8823349071831154e-06, + "loss": 0.0122, + "step": 220 + }, + { + "epoch": 3.5675675675675675, + "grad_norm": 0.4062931537628174, + "learning_rate": 2.853130008198855e-06, + "loss": 0.0088, + "step": 221 + }, + { + "epoch": 3.583783783783784, + "grad_norm": 0.4915783107280731, + "learning_rate": 2.8238758524311316e-06, + "loss": 0.0106, + "step": 222 + }, + { + "epoch": 3.6, + "grad_norm": 0.5415279865264893, + "learning_rate": 2.7945765204326664e-06, + "loss": 0.0102, + "step": 223 + }, + { + "epoch": 3.616216216216216, + "grad_norm": 0.5782752633094788, + "learning_rate": 2.7652360990576457e-06, + "loss": 0.0113, + "step": 224 + }, + { + "epoch": 3.6324324324324326, + "grad_norm": 0.5013785362243652, + "learning_rate": 2.735858680891656e-06, + "loss": 0.0092, + "step": 225 + }, + { + "epoch": 3.6486486486486487, + "grad_norm": 0.553318440914154, + "learning_rate": 2.7064483636808314e-06, + "loss": 0.0119, + "step": 226 + }, + { + "epoch": 3.6648648648648647, + "grad_norm": 0.47181862592697144, + "learning_rate": 2.677009249760268e-06, + "loss": 0.0152, + "step": 227 + }, + { + "epoch": 3.6810810810810812, + "grad_norm": 0.5877431631088257, + "learning_rate": 2.6475454454818072e-06, + "loss": 0.0181, + "step": 228 + }, + { + "epoch": 
3.6972972972972973, + "grad_norm": 0.6693160533905029, + "learning_rate": 2.6180610606412587e-06, + "loss": 0.0168, + "step": 229 + }, + { + "epoch": 3.7135135135135133, + "grad_norm": 0.6764176487922668, + "learning_rate": 2.5885602079051354e-06, + "loss": 0.0128, + "step": 230 + }, + { + "epoch": 3.72972972972973, + "grad_norm": 0.5279078483581543, + "learning_rate": 2.559047002236995e-06, + "loss": 0.0099, + "step": 231 + }, + { + "epoch": 3.745945945945946, + "grad_norm": 0.6159639954566956, + "learning_rate": 2.529525560323462e-06, + "loss": 0.0122, + "step": 232 + }, + { + "epoch": 3.762162162162162, + "grad_norm": 0.558202862739563, + "learning_rate": 2.5e-06, + "loss": 0.0118, + "step": 233 + }, + { + "epoch": 3.7783783783783784, + "grad_norm": 0.37727731466293335, + "learning_rate": 2.470474439676539e-06, + "loss": 0.0099, + "step": 234 + }, + { + "epoch": 3.7945945945945945, + "grad_norm": 0.4426223039627075, + "learning_rate": 2.4409529977630052e-06, + "loss": 0.0104, + "step": 235 + }, + { + "epoch": 3.810810810810811, + "grad_norm": 0.48488032817840576, + "learning_rate": 2.411439792094866e-06, + "loss": 0.0145, + "step": 236 + }, + { + "epoch": 3.827027027027027, + "grad_norm": 0.4551326632499695, + "learning_rate": 2.381938939358742e-06, + "loss": 0.0107, + "step": 237 + }, + { + "epoch": 3.8432432432432435, + "grad_norm": 0.738146185874939, + "learning_rate": 2.3524545545181936e-06, + "loss": 0.0132, + "step": 238 + }, + { + "epoch": 3.8594594594594596, + "grad_norm": 0.5243131518363953, + "learning_rate": 2.322990750239733e-06, + "loss": 0.0093, + "step": 239 + }, + { + "epoch": 3.8756756756756756, + "grad_norm": 0.4127775728702545, + "learning_rate": 2.2935516363191695e-06, + "loss": 0.0131, + "step": 240 + }, + { + "epoch": 3.891891891891892, + "grad_norm": 0.5398023128509521, + "learning_rate": 2.2641413191083445e-06, + "loss": 0.0122, + "step": 241 + }, + { + "epoch": 3.908108108108108, + "grad_norm": 0.45481953024864197, + "learning_rate": 
2.234763900942355e-06, + "loss": 0.0106, + "step": 242 + }, + { + "epoch": 3.924324324324324, + "grad_norm": 0.5116259455680847, + "learning_rate": 2.2054234795673336e-06, + "loss": 0.017, + "step": 243 + }, + { + "epoch": 3.9405405405405407, + "grad_norm": 0.4914882779121399, + "learning_rate": 2.1761241475688697e-06, + "loss": 0.0108, + "step": 244 + }, + { + "epoch": 3.9567567567567568, + "grad_norm": 0.42856982350349426, + "learning_rate": 2.146869991801146e-06, + "loss": 0.0111, + "step": 245 + }, + { + "epoch": 3.972972972972973, + "grad_norm": 0.45737922191619873, + "learning_rate": 2.117665092816885e-06, + "loss": 0.0095, + "step": 246 + }, + { + "epoch": 3.9891891891891893, + "grad_norm": 0.4869958758354187, + "learning_rate": 2.088513524298165e-06, + "loss": 0.0129, + "step": 247 + }, + { + "epoch": 4.0, + "grad_norm": 0.4869958758354187, + "learning_rate": 2.059419352488196e-06, + "loss": 0.0135, + "step": 248 + }, + { + "epoch": 4.0162162162162165, + "grad_norm": 0.8944225311279297, + "learning_rate": 2.030386635624135e-06, + "loss": 0.0067, + "step": 249 + }, + { + "epoch": 4.032432432432432, + "grad_norm": 0.3997219502925873, + "learning_rate": 2.0014194233710193e-06, + "loss": 0.0081, + "step": 250 + }, + { + "epoch": 4.048648648648649, + "grad_norm": 0.38661807775497437, + "learning_rate": 1.972521756256895e-06, + "loss": 0.0038, + "step": 251 + }, + { + "epoch": 4.064864864864865, + "grad_norm": 0.29563990235328674, + "learning_rate": 1.9436976651092143e-06, + "loss": 0.0066, + "step": 252 + }, + { + "epoch": 4.081081081081081, + "grad_norm": 0.254894495010376, + "learning_rate": 1.9149511704925945e-06, + "loss": 0.0058, + "step": 253 + }, + { + "epoch": 4.097297297297297, + "grad_norm": 0.23803076148033142, + "learning_rate": 1.8862862821480023e-06, + "loss": 0.0084, + "step": 254 + }, + { + "epoch": 4.113513513513514, + "grad_norm": 0.33866703510284424, + "learning_rate": 1.8577069984334522e-06, + "loss": 0.0099, + "step": 255 + }, + { + "epoch": 
4.12972972972973, + "grad_norm": 0.35744938254356384, + "learning_rate": 1.829217305766289e-06, + "loss": 0.0064, + "step": 256 + }, + { + "epoch": 4.145945945945946, + "grad_norm": 0.2983056902885437, + "learning_rate": 1.8008211780671353e-06, + "loss": 0.0111, + "step": 257 + }, + { + "epoch": 4.162162162162162, + "grad_norm": 0.29860594868659973, + "learning_rate": 1.772522576205589e-06, + "loss": 0.0046, + "step": 258 + }, + { + "epoch": 4.178378378378379, + "grad_norm": 0.4390548765659332, + "learning_rate": 1.7443254474477328e-06, + "loss": 0.0105, + "step": 259 + }, + { + "epoch": 4.194594594594594, + "grad_norm": 0.34081289172172546, + "learning_rate": 1.7162337249055478e-06, + "loss": 0.0093, + "step": 260 + }, + { + "epoch": 4.210810810810811, + "grad_norm": 0.32730501890182495, + "learning_rate": 1.6882513269882916e-06, + "loss": 0.0084, + "step": 261 + }, + { + "epoch": 4.227027027027027, + "grad_norm": 0.7214042544364929, + "learning_rate": 1.6603821568559436e-06, + "loss": 0.0044, + "step": 262 + }, + { + "epoch": 4.243243243243243, + "grad_norm": 0.36692219972610474, + "learning_rate": 1.6326301018747623e-06, + "loss": 0.0065, + "step": 263 + }, + { + "epoch": 4.2594594594594595, + "grad_norm": 0.32951614260673523, + "learning_rate": 1.6049990330750508e-06, + "loss": 0.0055, + "step": 264 + }, + { + "epoch": 4.275675675675676, + "grad_norm": 0.33026283979415894, + "learning_rate": 1.5774928046112025e-06, + "loss": 0.0052, + "step": 265 + }, + { + "epoch": 4.291891891891892, + "grad_norm": 0.2922450006008148, + "learning_rate": 1.5501152532241003e-06, + "loss": 0.0071, + "step": 266 + }, + { + "epoch": 4.308108108108108, + "grad_norm": 0.37345102429389954, + "learning_rate": 1.522870197705943e-06, + "loss": 0.0067, + "step": 267 + }, + { + "epoch": 4.324324324324325, + "grad_norm": 0.3973875343799591, + "learning_rate": 1.495761438367577e-06, + "loss": 0.0024, + "step": 268 + }, + { + "epoch": 4.34054054054054, + "grad_norm": 0.27959144115448, + 
"learning_rate": 1.4687927565084023e-06, + "loss": 0.0064, + "step": 269 + }, + { + "epoch": 4.356756756756757, + "grad_norm": 0.42801499366760254, + "learning_rate": 1.4419679138889379e-06, + "loss": 0.0074, + "step": 270 + }, + { + "epoch": 4.372972972972973, + "grad_norm": 0.5620250701904297, + "learning_rate": 1.415290652206105e-06, + "loss": 0.0085, + "step": 271 + }, + { + "epoch": 4.389189189189189, + "grad_norm": 0.4027979373931885, + "learning_rate": 1.3887646925713116e-06, + "loss": 0.0078, + "step": 272 + }, + { + "epoch": 4.405405405405405, + "grad_norm": 0.346365749835968, + "learning_rate": 1.3623937349914093e-06, + "loss": 0.0088, + "step": 273 + }, + { + "epoch": 4.421621621621622, + "grad_norm": 0.4886508584022522, + "learning_rate": 1.3361814578525922e-06, + "loss": 0.0058, + "step": 274 + }, + { + "epoch": 4.437837837837838, + "grad_norm": 0.33606165647506714, + "learning_rate": 1.3101315174073162e-06, + "loss": 0.0078, + "step": 275 + }, + { + "epoch": 4.454054054054054, + "grad_norm": 0.4041603207588196, + "learning_rate": 1.2842475472642969e-06, + "loss": 0.0037, + "step": 276 + }, + { + "epoch": 4.47027027027027, + "grad_norm": 0.9003347754478455, + "learning_rate": 1.258533157881674e-06, + "loss": 0.0054, + "step": 277 + }, + { + "epoch": 4.486486486486487, + "grad_norm": 0.5059202313423157, + "learning_rate": 1.2329919360634003e-06, + "loss": 0.0064, + "step": 278 + }, + { + "epoch": 4.5027027027027025, + "grad_norm": 0.4006575345993042, + "learning_rate": 1.2076274444589361e-06, + "loss": 0.0074, + "step": 279 + }, + { + "epoch": 4.518918918918919, + "grad_norm": 0.5821619033813477, + "learning_rate": 1.182443221066303e-06, + "loss": 0.0084, + "step": 280 + }, + { + "epoch": 4.535135135135135, + "grad_norm": 0.5989146828651428, + "learning_rate": 1.1574427787385853e-06, + "loss": 0.012, + "step": 281 + }, + { + "epoch": 4.551351351351351, + "grad_norm": 0.5475333333015442, + "learning_rate": 1.1326296046939334e-06, + "loss": 0.0035, + 
"step": 282 + }, + { + "epoch": 4.5675675675675675, + "grad_norm": 0.44198060035705566, + "learning_rate": 1.1080071600291453e-06, + "loss": 0.0055, + "step": 283 + }, + { + "epoch": 4.583783783783784, + "grad_norm": 1.231489658355713, + "learning_rate": 1.083578879236895e-06, + "loss": 0.0045, + "step": 284 + }, + { + "epoch": 4.6, + "grad_norm": 0.33816784620285034, + "learning_rate": 1.0593481697266582e-06, + "loss": 0.0076, + "step": 285 + }, + { + "epoch": 4.616216216216216, + "grad_norm": 0.5018823742866516, + "learning_rate": 1.0353184113494386e-06, + "loss": 0.0079, + "step": 286 + }, + { + "epoch": 4.632432432432433, + "grad_norm": 0.45690515637397766, + "learning_rate": 1.0114929559263122e-06, + "loss": 0.0061, + "step": 287 + }, + { + "epoch": 4.648648648648649, + "grad_norm": 0.4053182303905487, + "learning_rate": 9.878751267809069e-07, + "loss": 0.01, + "step": 288 + }, + { + "epoch": 4.664864864864865, + "grad_norm": 0.44401615858078003, + "learning_rate": 9.644682182758305e-07, + "loss": 0.0082, + "step": 289 + }, + { + "epoch": 4.681081081081081, + "grad_norm": 0.2947068214416504, + "learning_rate": 9.412754953531664e-07, + "loss": 0.0038, + "step": 290 + }, + { + "epoch": 4.697297297297297, + "grad_norm": 0.5023978352546692, + "learning_rate": 9.183001930790483e-07, + "loss": 0.0079, + "step": 291 + }, + { + "epoch": 4.713513513513513, + "grad_norm": 0.3027470111846924, + "learning_rate": 8.955455161924217e-07, + "loss": 0.0053, + "step": 292 + }, + { + "epoch": 4.72972972972973, + "grad_norm": 0.4287405014038086, + "learning_rate": 8.730146386580157e-07, + "loss": 0.0059, + "step": 293 + }, + { + "epoch": 4.745945945945946, + "grad_norm": 0.292800635099411, + "learning_rate": 8.507107032236323e-07, + "loss": 0.0041, + "step": 294 + }, + { + "epoch": 4.762162162162162, + "grad_norm": 0.23428203165531158, + "learning_rate": 8.286368209817644e-07, + "loss": 0.009, + "step": 295 + }, + { + "epoch": 4.778378378378378, + "grad_norm": 
0.48977068066596985, + "learning_rate": 8.067960709356479e-07, + "loss": 0.0054, + "step": 296 + }, + { + "epoch": 4.794594594594595, + "grad_norm": 0.46532517671585083, + "learning_rate": 7.851914995697801e-07, + "loss": 0.0056, + "step": 297 + }, + { + "epoch": 4.8108108108108105, + "grad_norm": 0.3898661732673645, + "learning_rate": 7.638261204249783e-07, + "loss": 0.0052, + "step": 298 + }, + { + "epoch": 4.827027027027027, + "grad_norm": 0.29227131605148315, + "learning_rate": 7.427029136780333e-07, + "loss": 0.0033, + "step": 299 + }, + { + "epoch": 4.8432432432432435, + "grad_norm": 0.36053887009620667, + "learning_rate": 7.218248257260127e-07, + "loss": 0.0128, + "step": 300 + }, + { + "epoch": 4.859459459459459, + "grad_norm": 0.4008624851703644, + "learning_rate": 7.011947687752804e-07, + "loss": 0.0106, + "step": 301 + }, + { + "epoch": 4.875675675675676, + "grad_norm": 0.5273243188858032, + "learning_rate": 6.808156204352845e-07, + "loss": 0.0081, + "step": 302 + }, + { + "epoch": 4.891891891891892, + "grad_norm": 0.328396201133728, + "learning_rate": 6.60690223317171e-07, + "loss": 0.0054, + "step": 303 + }, + { + "epoch": 4.908108108108108, + "grad_norm": 0.3451617360115051, + "learning_rate": 6.40821384637276e-07, + "loss": 0.007, + "step": 304 + }, + { + "epoch": 4.924324324324324, + "grad_norm": 0.4219134449958801, + "learning_rate": 6.212118758255595e-07, + "loss": 0.0099, + "step": 305 + } + ], + "logging_steps": 1, + "max_steps": 366, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 61, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 7.65389119118377e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-305/training_args.bin b/checkpoint-305/training_args.bin new file mode 100644 
index 0000000000000000000000000000000000000000..5b28d107f55169977eced33ac6929abb398bb2c5 --- /dev/null +++ b/checkpoint-305/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e2f1aaf0f48ae52048eea3703205522237e597bd418f53d57d152ef3ad9cbbc +size 8056 diff --git a/checkpoint-305/zero_to_fp32.py b/checkpoint-305/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/checkpoint-305/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in 
state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to 
the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel 
{unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + 
print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + 
partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the 
partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + 
state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. 
you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. 
Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/checkpoint-366/README.md b/checkpoint-366/README.md new file mode 100644 index 0000000000000000000000000000000000000000..be5c87703f12b547886cc6a2ecfbe9ee150496fa --- /dev/null +++ b/checkpoint-366/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Llama-3.1-8B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/checkpoint-366/adapter_config.json b/checkpoint-366/adapter_config.json new file mode 100644 index 
0000000000000000000000000000000000000000..aaa71b6240dcb4147fb982eb2f0ff89574c4fb31 --- /dev/null +++ b/checkpoint-366/adapter_config.json @@ -0,0 +1,40 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.1-8B-Instruct", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 512, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": [ + "embed_tokens", + "lm_head" + ], + "peft_type": "LORA", + "r": 256, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "k_proj", + "up_proj", + "gate_proj", + "v_proj", + "down_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-366/adapter_model.safetensors b/checkpoint-366/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..53315ad4415ef8b3e9fcdaf52f955692b9fe42c8 --- /dev/null +++ b/checkpoint-366/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f9101224637dbc63c919a24fefc3d52e1b9fc2dc8d5355d259947e5d493e419 +size 3443586272 diff --git a/checkpoint-366/global_step364/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-366/global_step364/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..63584a6b96106a7f9da8b7e056335018aa6c06dd --- /dev/null +++ b/checkpoint-366/global_step364/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ad7df0e1fa3e04728763bbc6ed63d33389abf3907f2f71dd290e25885f59c6b +size 20661195036 diff --git 
a/checkpoint-366/global_step364/mp_rank_00_model_states.pt b/checkpoint-366/global_step364/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c5856a80be5985162730d019d8eebe720478fe7b --- /dev/null +++ b/checkpoint-366/global_step364/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4940242f9af4753853f5221f46ebe29e6892a46146b3cf894e37421f3f3da51c +size 3555326649 diff --git a/checkpoint-366/latest b/checkpoint-366/latest new file mode 100644 index 0000000000000000000000000000000000000000..533612ebd47200efc295f7f40c5c332c16c36bf6 --- /dev/null +++ b/checkpoint-366/latest @@ -0,0 +1 @@ +global_step364 \ No newline at end of file diff --git a/checkpoint-366/rng_state.pth b/checkpoint-366/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..bc2ec734aed02f50f5c2b81ff7c57bdcdac2c74a --- /dev/null +++ b/checkpoint-366/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e12828203c5471d0fbe5e50c003fb936f3c3363ce1b5561ba3f55122e5a0f2be +size 14244 diff --git a/checkpoint-366/scheduler.pt b/checkpoint-366/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..367062f4fb89055a9cc2c4078ca0d4c0f67bba9b --- /dev/null +++ b/checkpoint-366/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40b0a2f46e82f16e42afad794a3cb1775ba3931a614ff599b3c7d284293cf063 +size 1064 diff --git a/checkpoint-366/special_tokens_map.json b/checkpoint-366/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..278b7f0f84be865c4687700ee7b3c63d89a51e18 --- /dev/null +++ b/checkpoint-366/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-366/tokenizer.json b/checkpoint-366/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/checkpoint-366/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/checkpoint-366/tokenizer_config.json b/checkpoint-366/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ca91a2ef55f4239a7af81d7c9abb05f53621a07b --- /dev/null +++ b/checkpoint-366/tokenizer_config.json @@ -0,0 +1,2064 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": 
"<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": 
"<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": 
false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": 
false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": 
"<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": 
"<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + 
"extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|end_of_text|>", + "tokenizer_class": "PreTrainedTokenizer" +} diff --git a/checkpoint-366/trainer_state.json b/checkpoint-366/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..3c334b321ac40c40d4ba68942011ef18733e1b66 --- /dev/null +++ b/checkpoint-366/trainer_state.json @@ -0,0 +1,2595 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 5.908108108108108, + "eval_steps": 500, + "global_step": 366, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.016216216216216217, + "grad_norm": 39.12052917480469, + "learning_rate": 5.0000000000000004e-08, + "loss": 2.2957, + "step": 1 + }, + { + "epoch": 0.032432432432432434, + "grad_norm": 38.9581413269043, + "learning_rate": 1.0000000000000001e-07, + "loss": 2.2959, + "step": 2 + }, + { + "epoch": 0.04864864864864865, + "grad_norm": 39.2702751159668, + "learning_rate": 1.5000000000000002e-07, + "loss": 2.2677, + "step": 3 + }, + { + "epoch": 0.06486486486486487, + "grad_norm": 39.18815231323242, + "learning_rate": 2.0000000000000002e-07, + "loss": 2.2936, + "step": 4 + }, + { + "epoch": 0.08108108108108109, + "grad_norm": 38.66701889038086, + "learning_rate": 2.5000000000000004e-07, + "loss": 2.2561, + "step": 5 + }, + { + "epoch": 0.0972972972972973, + "grad_norm": 39.53536605834961, + "learning_rate": 3.0000000000000004e-07, + "loss": 2.2579, + "step": 6 + }, + { + "epoch": 0.11351351351351352, + "grad_norm": 39.3793830871582, + "learning_rate": 3.5000000000000004e-07, + "loss": 2.2627, + "step": 7 + }, + { + "epoch": 0.12972972972972974, + "grad_norm": 39.88922119140625, + "learning_rate": 4.0000000000000003e-07, + "loss": 2.2729, + "step": 8 + }, + { + "epoch": 0.14594594594594595, + "grad_norm": 37.9880256652832, + "learning_rate": 
4.5000000000000003e-07, + "loss": 2.2311, + "step": 9 + }, + { + "epoch": 0.16216216216216217, + "grad_norm": 37.024139404296875, + "learning_rate": 5.000000000000001e-07, + "loss": 2.1773, + "step": 10 + }, + { + "epoch": 0.1783783783783784, + "grad_norm": 36.89325714111328, + "learning_rate": 5.5e-07, + "loss": 2.1927, + "step": 11 + }, + { + "epoch": 0.1945945945945946, + "grad_norm": 37.244178771972656, + "learning_rate": 6.000000000000001e-07, + "loss": 2.1757, + "step": 12 + }, + { + "epoch": 0.21081081081081082, + "grad_norm": 34.77650451660156, + "learning_rate": 6.5e-07, + "loss": 2.0392, + "step": 13 + }, + { + "epoch": 0.22702702702702704, + "grad_norm": 34.78818893432617, + "learning_rate": 7.000000000000001e-07, + "loss": 1.9996, + "step": 14 + }, + { + "epoch": 0.24324324324324326, + "grad_norm": 34.86852264404297, + "learning_rate": 7.5e-07, + "loss": 1.9496, + "step": 15 + }, + { + "epoch": 0.2594594594594595, + "grad_norm": 35.202796936035156, + "learning_rate": 8.000000000000001e-07, + "loss": 1.8542, + "step": 16 + }, + { + "epoch": 0.2756756756756757, + "grad_norm": 34.11354064941406, + "learning_rate": 8.500000000000001e-07, + "loss": 1.7118, + "step": 17 + }, + { + "epoch": 0.2918918918918919, + "grad_norm": 36.309059143066406, + "learning_rate": 9.000000000000001e-07, + "loss": 1.6834, + "step": 18 + }, + { + "epoch": 0.3081081081081081, + "grad_norm": 34.69994354248047, + "learning_rate": 9.500000000000001e-07, + "loss": 1.5298, + "step": 19 + }, + { + "epoch": 0.32432432432432434, + "grad_norm": 35.43153381347656, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.4191, + "step": 20 + }, + { + "epoch": 0.34054054054054056, + "grad_norm": 33.53745651245117, + "learning_rate": 1.0500000000000001e-06, + "loss": 1.3068, + "step": 21 + }, + { + "epoch": 0.3567567567567568, + "grad_norm": 33.775604248046875, + "learning_rate": 1.1e-06, + "loss": 1.224, + "step": 22 + }, + { + "epoch": 0.372972972972973, + "grad_norm": 30.57005500793457, + 
"learning_rate": 1.1500000000000002e-06, + "loss": 1.0704, + "step": 23 + }, + { + "epoch": 0.3891891891891892, + "grad_norm": 27.964860916137695, + "learning_rate": 1.2000000000000002e-06, + "loss": 0.9548, + "step": 24 + }, + { + "epoch": 0.40540540540540543, + "grad_norm": 26.023576736450195, + "learning_rate": 1.25e-06, + "loss": 0.8503, + "step": 25 + }, + { + "epoch": 0.42162162162162165, + "grad_norm": 25.0452938079834, + "learning_rate": 1.3e-06, + "loss": 0.6938, + "step": 26 + }, + { + "epoch": 0.43783783783783786, + "grad_norm": 24.663373947143555, + "learning_rate": 1.3500000000000002e-06, + "loss": 0.5648, + "step": 27 + }, + { + "epoch": 0.4540540540540541, + "grad_norm": 21.61736488342285, + "learning_rate": 1.4000000000000001e-06, + "loss": 0.435, + "step": 28 + }, + { + "epoch": 0.4702702702702703, + "grad_norm": 18.3259334564209, + "learning_rate": 1.45e-06, + "loss": 0.3322, + "step": 29 + }, + { + "epoch": 0.4864864864864865, + "grad_norm": 16.80081558227539, + "learning_rate": 1.5e-06, + "loss": 0.2625, + "step": 30 + }, + { + "epoch": 0.5027027027027027, + "grad_norm": 14.789258003234863, + "learning_rate": 1.5500000000000002e-06, + "loss": 0.1757, + "step": 31 + }, + { + "epoch": 0.518918918918919, + "grad_norm": 10.406538963317871, + "learning_rate": 1.6000000000000001e-06, + "loss": 0.1376, + "step": 32 + }, + { + "epoch": 0.5351351351351351, + "grad_norm": 4.868802547454834, + "learning_rate": 1.6500000000000003e-06, + "loss": 0.0815, + "step": 33 + }, + { + "epoch": 0.5513513513513514, + "grad_norm": 1.8639686107635498, + "learning_rate": 1.7000000000000002e-06, + "loss": 0.0628, + "step": 34 + }, + { + "epoch": 0.5675675675675675, + "grad_norm": 1.897918462753296, + "learning_rate": 1.75e-06, + "loss": 0.0775, + "step": 35 + }, + { + "epoch": 0.5837837837837838, + "grad_norm": 1.296712040901184, + "learning_rate": 1.8000000000000001e-06, + "loss": 0.0565, + "step": 36 + }, + { + "epoch": 0.6, + "grad_norm": 1.0163214206695557, + 
"learning_rate": 1.85e-06, + "loss": 0.0544, + "step": 37 + }, + { + "epoch": 0.6162162162162163, + "grad_norm": 1.070162296295166, + "learning_rate": 1.9000000000000002e-06, + "loss": 0.0621, + "step": 38 + }, + { + "epoch": 0.6324324324324324, + "grad_norm": 1.024267315864563, + "learning_rate": 1.9500000000000004e-06, + "loss": 0.0566, + "step": 39 + }, + { + "epoch": 0.6486486486486487, + "grad_norm": 0.9016611576080322, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.0511, + "step": 40 + }, + { + "epoch": 0.6648648648648648, + "grad_norm": 0.8272562623023987, + "learning_rate": 2.05e-06, + "loss": 0.0533, + "step": 41 + }, + { + "epoch": 0.6810810810810811, + "grad_norm": 0.8875278234481812, + "learning_rate": 2.1000000000000002e-06, + "loss": 0.0495, + "step": 42 + }, + { + "epoch": 0.6972972972972973, + "grad_norm": 0.8804877996444702, + "learning_rate": 2.15e-06, + "loss": 0.0506, + "step": 43 + }, + { + "epoch": 0.7135135135135136, + "grad_norm": 0.7133358120918274, + "learning_rate": 2.2e-06, + "loss": 0.0467, + "step": 44 + }, + { + "epoch": 0.7297297297297297, + "grad_norm": 0.8142214417457581, + "learning_rate": 2.25e-06, + "loss": 0.0552, + "step": 45 + }, + { + "epoch": 0.745945945945946, + "grad_norm": 0.8341564536094666, + "learning_rate": 2.3000000000000004e-06, + "loss": 0.0574, + "step": 46 + }, + { + "epoch": 0.7621621621621621, + "grad_norm": 0.6500507593154907, + "learning_rate": 2.35e-06, + "loss": 0.0398, + "step": 47 + }, + { + "epoch": 0.7783783783783784, + "grad_norm": 0.6163598895072937, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.0459, + "step": 48 + }, + { + "epoch": 0.7945945945945946, + "grad_norm": 0.663949191570282, + "learning_rate": 2.4500000000000003e-06, + "loss": 0.046, + "step": 49 + }, + { + "epoch": 0.8108108108108109, + "grad_norm": 0.7521553635597229, + "learning_rate": 2.5e-06, + "loss": 0.0525, + "step": 50 + }, + { + "epoch": 0.827027027027027, + "grad_norm": 0.7828383445739746, + "learning_rate": 
2.55e-06, + "loss": 0.0558, + "step": 51 + }, + { + "epoch": 0.8432432432432433, + "grad_norm": 0.7935078740119934, + "learning_rate": 2.6e-06, + "loss": 0.0451, + "step": 52 + }, + { + "epoch": 0.8594594594594595, + "grad_norm": 0.6327880620956421, + "learning_rate": 2.6500000000000005e-06, + "loss": 0.0403, + "step": 53 + }, + { + "epoch": 0.8756756756756757, + "grad_norm": 0.6185981035232544, + "learning_rate": 2.7000000000000004e-06, + "loss": 0.0406, + "step": 54 + }, + { + "epoch": 0.8918918918918919, + "grad_norm": 0.5417979955673218, + "learning_rate": 2.7500000000000004e-06, + "loss": 0.0426, + "step": 55 + }, + { + "epoch": 0.9081081081081082, + "grad_norm": 0.7140630483627319, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.0446, + "step": 56 + }, + { + "epoch": 0.9243243243243243, + "grad_norm": 0.7191944122314453, + "learning_rate": 2.85e-06, + "loss": 0.047, + "step": 57 + }, + { + "epoch": 0.9405405405405406, + "grad_norm": 0.7562940716743469, + "learning_rate": 2.9e-06, + "loss": 0.0476, + "step": 58 + }, + { + "epoch": 0.9567567567567568, + "grad_norm": 0.7422239184379578, + "learning_rate": 2.95e-06, + "loss": 0.0462, + "step": 59 + }, + { + "epoch": 0.972972972972973, + "grad_norm": 0.677144467830658, + "learning_rate": 3e-06, + "loss": 0.0475, + "step": 60 + }, + { + "epoch": 0.9891891891891892, + "grad_norm": 0.6127192974090576, + "learning_rate": 3.05e-06, + "loss": 0.0434, + "step": 61 + }, + { + "epoch": 1.0, + "grad_norm": 0.6127192974090576, + "learning_rate": 3.1000000000000004e-06, + "loss": 0.0375, + "step": 62 + }, + { + "epoch": 1.0162162162162163, + "grad_norm": 0.959559440612793, + "learning_rate": 3.1500000000000003e-06, + "loss": 0.0421, + "step": 63 + }, + { + "epoch": 1.0324324324324325, + "grad_norm": 0.6539880037307739, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.0414, + "step": 64 + }, + { + "epoch": 1.0486486486486486, + "grad_norm": 0.5929313898086548, + "learning_rate": 3.2500000000000002e-06, + "loss": 
0.0451, + "step": 65 + }, + { + "epoch": 1.0648648648648649, + "grad_norm": 0.6479571461677551, + "learning_rate": 3.3000000000000006e-06, + "loss": 0.0415, + "step": 66 + }, + { + "epoch": 1.0810810810810811, + "grad_norm": 0.5496926307678223, + "learning_rate": 3.3500000000000005e-06, + "loss": 0.0366, + "step": 67 + }, + { + "epoch": 1.0972972972972972, + "grad_norm": 0.5373682379722595, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.0383, + "step": 68 + }, + { + "epoch": 1.1135135135135135, + "grad_norm": 0.5489712357521057, + "learning_rate": 3.45e-06, + "loss": 0.0427, + "step": 69 + }, + { + "epoch": 1.1297297297297297, + "grad_norm": 0.6830047369003296, + "learning_rate": 3.5e-06, + "loss": 0.039, + "step": 70 + }, + { + "epoch": 1.145945945945946, + "grad_norm": 0.5794199705123901, + "learning_rate": 3.5500000000000003e-06, + "loss": 0.0409, + "step": 71 + }, + { + "epoch": 1.1621621621621623, + "grad_norm": 0.571513831615448, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.0392, + "step": 72 + }, + { + "epoch": 1.1783783783783783, + "grad_norm": 0.7753933668136597, + "learning_rate": 3.65e-06, + "loss": 0.0365, + "step": 73 + }, + { + "epoch": 1.1945945945945946, + "grad_norm": 0.6135310530662537, + "learning_rate": 3.7e-06, + "loss": 0.036, + "step": 74 + }, + { + "epoch": 1.2108108108108109, + "grad_norm": 0.5497344136238098, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.035, + "step": 75 + }, + { + "epoch": 1.227027027027027, + "grad_norm": 0.5861782431602478, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.0434, + "step": 76 + }, + { + "epoch": 1.2432432432432432, + "grad_norm": 0.6941010355949402, + "learning_rate": 3.85e-06, + "loss": 0.0336, + "step": 77 + }, + { + "epoch": 1.2594594594594595, + "grad_norm": 0.5305830240249634, + "learning_rate": 3.900000000000001e-06, + "loss": 0.0391, + "step": 78 + }, + { + "epoch": 1.2756756756756757, + "grad_norm": 0.6456385254859924, + "learning_rate": 3.95e-06, + "loss": 
0.0422, + "step": 79 + }, + { + "epoch": 1.291891891891892, + "grad_norm": 0.5704363584518433, + "learning_rate": 4.000000000000001e-06, + "loss": 0.0342, + "step": 80 + }, + { + "epoch": 1.308108108108108, + "grad_norm": 0.5257390141487122, + "learning_rate": 4.05e-06, + "loss": 0.0369, + "step": 81 + }, + { + "epoch": 1.3243243243243243, + "grad_norm": 0.5541989207267761, + "learning_rate": 4.1e-06, + "loss": 0.0331, + "step": 82 + }, + { + "epoch": 1.3405405405405406, + "grad_norm": 0.7190688252449036, + "learning_rate": 4.15e-06, + "loss": 0.039, + "step": 83 + }, + { + "epoch": 1.3567567567567567, + "grad_norm": 0.4766721725463867, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.0354, + "step": 84 + }, + { + "epoch": 1.372972972972973, + "grad_norm": 0.5847981572151184, + "learning_rate": 4.25e-06, + "loss": 0.0355, + "step": 85 + }, + { + "epoch": 1.3891891891891892, + "grad_norm": 0.6361181139945984, + "learning_rate": 4.3e-06, + "loss": 0.0415, + "step": 86 + }, + { + "epoch": 1.4054054054054055, + "grad_norm": 0.6437036395072937, + "learning_rate": 4.350000000000001e-06, + "loss": 0.0353, + "step": 87 + }, + { + "epoch": 1.4216216216216218, + "grad_norm": 0.712043046951294, + "learning_rate": 4.4e-06, + "loss": 0.0311, + "step": 88 + }, + { + "epoch": 1.4378378378378378, + "grad_norm": 0.5829771757125854, + "learning_rate": 4.450000000000001e-06, + "loss": 0.0433, + "step": 89 + }, + { + "epoch": 1.454054054054054, + "grad_norm": 0.6977937817573547, + "learning_rate": 4.5e-06, + "loss": 0.0391, + "step": 90 + }, + { + "epoch": 1.4702702702702704, + "grad_norm": 0.49931228160858154, + "learning_rate": 4.5500000000000005e-06, + "loss": 0.0352, + "step": 91 + }, + { + "epoch": 1.4864864864864864, + "grad_norm": 0.5281490683555603, + "learning_rate": 4.600000000000001e-06, + "loss": 0.0385, + "step": 92 + }, + { + "epoch": 1.5027027027027027, + "grad_norm": 0.613349974155426, + "learning_rate": 4.65e-06, + "loss": 0.0399, + "step": 93 + }, + { + 
"epoch": 1.518918918918919, + "grad_norm": 0.6584879755973816, + "learning_rate": 4.7e-06, + "loss": 0.043, + "step": 94 + }, + { + "epoch": 1.535135135135135, + "grad_norm": 0.6006895303726196, + "learning_rate": 4.75e-06, + "loss": 0.0372, + "step": 95 + }, + { + "epoch": 1.5513513513513515, + "grad_norm": 0.5364943146705627, + "learning_rate": 4.800000000000001e-06, + "loss": 0.0384, + "step": 96 + }, + { + "epoch": 1.5675675675675675, + "grad_norm": 0.4963968098163605, + "learning_rate": 4.85e-06, + "loss": 0.0324, + "step": 97 + }, + { + "epoch": 1.5837837837837838, + "grad_norm": 0.5868538618087769, + "learning_rate": 4.9000000000000005e-06, + "loss": 0.0386, + "step": 98 + }, + { + "epoch": 1.6, + "grad_norm": 0.6690974235534668, + "learning_rate": 4.95e-06, + "loss": 0.0332, + "step": 99 + }, + { + "epoch": 1.6162162162162161, + "grad_norm": 0.6118388175964355, + "learning_rate": 5e-06, + "loss": 0.0398, + "step": 100 + }, + { + "epoch": 1.6324324324324324, + "grad_norm": 0.6872304677963257, + "learning_rate": 4.999825642177387e-06, + "loss": 0.0333, + "step": 101 + }, + { + "epoch": 1.6486486486486487, + "grad_norm": 0.6457200646400452, + "learning_rate": 4.999302593030069e-06, + "loss": 0.0381, + "step": 102 + }, + { + "epoch": 1.6648648648648647, + "grad_norm": 0.6096416115760803, + "learning_rate": 4.998430925516213e-06, + "loss": 0.0385, + "step": 103 + }, + { + "epoch": 1.6810810810810812, + "grad_norm": 0.582796573638916, + "learning_rate": 4.99721076122146e-06, + "loss": 0.0317, + "step": 104 + }, + { + "epoch": 1.6972972972972973, + "grad_norm": 0.5576394200325012, + "learning_rate": 4.995642270341961e-06, + "loss": 0.0378, + "step": 105 + }, + { + "epoch": 1.7135135135135136, + "grad_norm": 0.7414760589599609, + "learning_rate": 4.99372567166064e-06, + "loss": 0.0403, + "step": 106 + }, + { + "epoch": 1.7297297297297298, + "grad_norm": 0.6029103994369507, + "learning_rate": 4.991461232516675e-06, + "loss": 0.0418, + "step": 107 + }, + { + "epoch": 
1.7459459459459459, + "grad_norm": 0.771609365940094, + "learning_rate": 4.98884926876821e-06, + "loss": 0.0413, + "step": 108 + }, + { + "epoch": 1.7621621621621621, + "grad_norm": 0.6869891285896301, + "learning_rate": 4.9858901447482924e-06, + "loss": 0.0367, + "step": 109 + }, + { + "epoch": 1.7783783783783784, + "grad_norm": 0.4931647479534149, + "learning_rate": 4.982584273214061e-06, + "loss": 0.033, + "step": 110 + }, + { + "epoch": 1.7945945945945945, + "grad_norm": 0.5160052180290222, + "learning_rate": 4.978932115289165e-06, + "loss": 0.0357, + "step": 111 + }, + { + "epoch": 1.810810810810811, + "grad_norm": 0.49750861525535583, + "learning_rate": 4.974934180399447e-06, + "loss": 0.0333, + "step": 112 + }, + { + "epoch": 1.827027027027027, + "grad_norm": 0.6596441864967346, + "learning_rate": 4.970591026201884e-06, + "loss": 0.0354, + "step": 113 + }, + { + "epoch": 1.8432432432432433, + "grad_norm": 0.6613579988479614, + "learning_rate": 4.965903258506806e-06, + "loss": 0.0377, + "step": 114 + }, + { + "epoch": 1.8594594594594596, + "grad_norm": 0.5383866429328918, + "learning_rate": 4.9608715311933865e-06, + "loss": 0.0418, + "step": 115 + }, + { + "epoch": 1.8756756756756756, + "grad_norm": 0.6303413510322571, + "learning_rate": 4.955496546118439e-06, + "loss": 0.0351, + "step": 116 + }, + { + "epoch": 1.8918918918918919, + "grad_norm": 0.5293605923652649, + "learning_rate": 4.949779053018519e-06, + "loss": 0.0322, + "step": 117 + }, + { + "epoch": 1.9081081081081082, + "grad_norm": 0.5211143493652344, + "learning_rate": 4.943719849405347e-06, + "loss": 0.0374, + "step": 118 + }, + { + "epoch": 1.9243243243243242, + "grad_norm": 0.5933778882026672, + "learning_rate": 4.937319780454559e-06, + "loss": 0.0377, + "step": 119 + }, + { + "epoch": 1.9405405405405407, + "grad_norm": 0.6020687818527222, + "learning_rate": 4.930579738887827e-06, + "loss": 0.0313, + "step": 120 + }, + { + "epoch": 1.9567567567567568, + "grad_norm": 0.7828154563903809, + 
"learning_rate": 4.923500664848327e-06, + "loss": 0.0372, + "step": 121 + }, + { + "epoch": 1.972972972972973, + "grad_norm": 0.6172424554824829, + "learning_rate": 4.9160835457696075e-06, + "loss": 0.0387, + "step": 122 + }, + { + "epoch": 1.9891891891891893, + "grad_norm": 0.5671921372413635, + "learning_rate": 4.9083294162378545e-06, + "loss": 0.0346, + "step": 123 + }, + { + "epoch": 2.0, + "grad_norm": 1.0704405307769775, + "learning_rate": 4.900239357847582e-06, + "loss": 0.0298, + "step": 124 + }, + { + "epoch": 2.016216216216216, + "grad_norm": 0.5932011604309082, + "learning_rate": 4.891814499050762e-06, + "loss": 0.0243, + "step": 125 + }, + { + "epoch": 2.0324324324324325, + "grad_norm": 0.47397834062576294, + "learning_rate": 4.883056014999423e-06, + "loss": 0.0281, + "step": 126 + }, + { + "epoch": 2.0486486486486486, + "grad_norm": 0.538270115852356, + "learning_rate": 4.873965127381734e-06, + "loss": 0.0268, + "step": 127 + }, + { + "epoch": 2.064864864864865, + "grad_norm": 0.3924686908721924, + "learning_rate": 4.864543104251587e-06, + "loss": 0.02, + "step": 128 + }, + { + "epoch": 2.081081081081081, + "grad_norm": 0.5162842869758606, + "learning_rate": 4.854791259851735e-06, + "loss": 0.0237, + "step": 129 + }, + { + "epoch": 2.097297297297297, + "grad_norm": 0.4691126048564911, + "learning_rate": 4.844710954430464e-06, + "loss": 0.0224, + "step": 130 + }, + { + "epoch": 2.1135135135135137, + "grad_norm": 0.47650063037872314, + "learning_rate": 4.834303594051854e-06, + "loss": 0.0202, + "step": 131 + }, + { + "epoch": 2.1297297297297297, + "grad_norm": 0.5041627883911133, + "learning_rate": 4.823570630399665e-06, + "loss": 0.0228, + "step": 132 + }, + { + "epoch": 2.145945945945946, + "grad_norm": 0.5368483662605286, + "learning_rate": 4.812513560574832e-06, + "loss": 0.0241, + "step": 133 + }, + { + "epoch": 2.1621621621621623, + "grad_norm": 0.7245975732803345, + "learning_rate": 4.8011339268866505e-06, + "loss": 0.0294, + "step": 134 + }, + { 
+ "epoch": 2.1783783783783783, + "grad_norm": 0.5506283044815063, + "learning_rate": 4.789433316637644e-06, + "loss": 0.0204, + "step": 135 + }, + { + "epoch": 2.1945945945945944, + "grad_norm": 0.5240617990493774, + "learning_rate": 4.777413361902152e-06, + "loss": 0.0227, + "step": 136 + }, + { + "epoch": 2.210810810810811, + "grad_norm": 0.6164438128471375, + "learning_rate": 4.765075739298683e-06, + "loss": 0.0209, + "step": 137 + }, + { + "epoch": 2.227027027027027, + "grad_norm": 0.551898181438446, + "learning_rate": 4.752422169756048e-06, + "loss": 0.0187, + "step": 138 + }, + { + "epoch": 2.2432432432432434, + "grad_norm": 0.45092299580574036, + "learning_rate": 4.739454418273314e-06, + "loss": 0.0281, + "step": 139 + }, + { + "epoch": 2.2594594594594595, + "grad_norm": 0.48173126578330994, + "learning_rate": 4.726174293673612e-06, + "loss": 0.0213, + "step": 140 + }, + { + "epoch": 2.2756756756756755, + "grad_norm": 0.48536229133605957, + "learning_rate": 4.712583648351827e-06, + "loss": 0.0204, + "step": 141 + }, + { + "epoch": 2.291891891891892, + "grad_norm": 0.4885499179363251, + "learning_rate": 4.698684378016223e-06, + "loss": 0.0225, + "step": 142 + }, + { + "epoch": 2.308108108108108, + "grad_norm": 0.40719687938690186, + "learning_rate": 4.684478421424007e-06, + "loss": 0.0205, + "step": 143 + }, + { + "epoch": 2.3243243243243246, + "grad_norm": 0.4365272521972656, + "learning_rate": 4.669967760110908e-06, + "loss": 0.0224, + "step": 144 + }, + { + "epoch": 2.3405405405405406, + "grad_norm": 0.4639301300048828, + "learning_rate": 4.655154418114774e-06, + "loss": 0.0256, + "step": 145 + }, + { + "epoch": 2.3567567567567567, + "grad_norm": 0.47420835494995117, + "learning_rate": 4.6400404616932505e-06, + "loss": 0.0208, + "step": 146 + }, + { + "epoch": 2.372972972972973, + "grad_norm": 0.5030474066734314, + "learning_rate": 4.624627999035564e-06, + "loss": 0.0255, + "step": 147 + }, + { + "epoch": 2.389189189189189, + "grad_norm": 
0.47888803482055664, + "learning_rate": 4.608919179968457e-06, + "loss": 0.0241, + "step": 148 + }, + { + "epoch": 2.4054054054054053, + "grad_norm": 0.602581262588501, + "learning_rate": 4.592916195656322e-06, + "loss": 0.0243, + "step": 149 + }, + { + "epoch": 2.4216216216216218, + "grad_norm": 0.6816417574882507, + "learning_rate": 4.576621278295558e-06, + "loss": 0.0259, + "step": 150 + }, + { + "epoch": 2.437837837837838, + "grad_norm": 0.6839447617530823, + "learning_rate": 4.5600367008032135e-06, + "loss": 0.0247, + "step": 151 + }, + { + "epoch": 2.454054054054054, + "grad_norm": 0.496794193983078, + "learning_rate": 4.543164776499945e-06, + "loss": 0.0244, + "step": 152 + }, + { + "epoch": 2.4702702702702704, + "grad_norm": 0.4372956156730652, + "learning_rate": 4.5260078587873416e-06, + "loss": 0.0208, + "step": 153 + }, + { + "epoch": 2.4864864864864864, + "grad_norm": 0.6199434399604797, + "learning_rate": 4.508568340819654e-06, + "loss": 0.028, + "step": 154 + }, + { + "epoch": 2.5027027027027025, + "grad_norm": 0.6074104905128479, + "learning_rate": 4.490848655169986e-06, + "loss": 0.0278, + "step": 155 + }, + { + "epoch": 2.518918918918919, + "grad_norm": 0.5419324636459351, + "learning_rate": 4.472851273490985e-06, + "loss": 0.0181, + "step": 156 + }, + { + "epoch": 2.535135135135135, + "grad_norm": 0.4877943992614746, + "learning_rate": 4.454578706170075e-06, + "loss": 0.0214, + "step": 157 + }, + { + "epoch": 2.5513513513513515, + "grad_norm": 0.5049244165420532, + "learning_rate": 4.436033501979299e-06, + "loss": 0.0214, + "step": 158 + }, + { + "epoch": 2.5675675675675675, + "grad_norm": 0.45928511023521423, + "learning_rate": 4.417218247719794e-06, + "loss": 0.0167, + "step": 159 + }, + { + "epoch": 2.583783783783784, + "grad_norm": 0.5185860395431519, + "learning_rate": 4.398135567860972e-06, + "loss": 0.0243, + "step": 160 + }, + { + "epoch": 2.6, + "grad_norm": 0.3984812796115875, + "learning_rate": 4.378788124174441e-06, + "loss": 0.0201, + 
"step": 161 + }, + { + "epoch": 2.616216216216216, + "grad_norm": 0.607692301273346, + "learning_rate": 4.359178615362725e-06, + "loss": 0.0247, + "step": 162 + }, + { + "epoch": 2.6324324324324326, + "grad_norm": 0.5436367988586426, + "learning_rate": 4.33930977668283e-06, + "loss": 0.0204, + "step": 163 + }, + { + "epoch": 2.6486486486486487, + "grad_norm": 0.6367728114128113, + "learning_rate": 4.319184379564716e-06, + "loss": 0.0222, + "step": 164 + }, + { + "epoch": 2.6648648648648647, + "grad_norm": 0.5538708567619324, + "learning_rate": 4.298805231224721e-06, + "loss": 0.0215, + "step": 165 + }, + { + "epoch": 2.6810810810810812, + "grad_norm": 0.5421778559684753, + "learning_rate": 4.278175174273989e-06, + "loss": 0.0201, + "step": 166 + }, + { + "epoch": 2.6972972972972973, + "grad_norm": 0.6123104691505432, + "learning_rate": 4.257297086321967e-06, + "loss": 0.0209, + "step": 167 + }, + { + "epoch": 2.7135135135135133, + "grad_norm": 0.6386194229125977, + "learning_rate": 4.236173879575022e-06, + "loss": 0.0214, + "step": 168 + }, + { + "epoch": 2.72972972972973, + "grad_norm": 0.6019391417503357, + "learning_rate": 4.2148085004302205e-06, + "loss": 0.0246, + "step": 169 + }, + { + "epoch": 2.745945945945946, + "grad_norm": 0.5638225674629211, + "learning_rate": 4.1932039290643534e-06, + "loss": 0.0189, + "step": 170 + }, + { + "epoch": 2.762162162162162, + "grad_norm": 0.6640142202377319, + "learning_rate": 4.1713631790182366e-06, + "loss": 0.0236, + "step": 171 + }, + { + "epoch": 2.7783783783783784, + "grad_norm": 0.5170625448226929, + "learning_rate": 4.149289296776369e-06, + "loss": 0.0203, + "step": 172 + }, + { + "epoch": 2.7945945945945945, + "grad_norm": 0.5130777955055237, + "learning_rate": 4.126985361341984e-06, + "loss": 0.0195, + "step": 173 + }, + { + "epoch": 2.810810810810811, + "grad_norm": 0.5125660300254822, + "learning_rate": 4.104454483807579e-06, + "loss": 0.0229, + "step": 174 + }, + { + "epoch": 2.827027027027027, + "grad_norm": 
0.573662281036377, + "learning_rate": 4.0816998069209516e-06, + "loss": 0.0202, + "step": 175 + }, + { + "epoch": 2.8432432432432435, + "grad_norm": 0.6013869643211365, + "learning_rate": 4.058724504646834e-06, + "loss": 0.0319, + "step": 176 + }, + { + "epoch": 2.8594594594594596, + "grad_norm": 0.5050269365310669, + "learning_rate": 4.0355317817241705e-06, + "loss": 0.0189, + "step": 177 + }, + { + "epoch": 2.8756756756756756, + "grad_norm": 0.5249143838882446, + "learning_rate": 4.012124873219094e-06, + "loss": 0.0214, + "step": 178 + }, + { + "epoch": 2.891891891891892, + "grad_norm": 0.5124053955078125, + "learning_rate": 3.988507044073687e-06, + "loss": 0.0162, + "step": 179 + }, + { + "epoch": 2.908108108108108, + "grad_norm": 0.4640377461910248, + "learning_rate": 3.964681588650562e-06, + "loss": 0.0189, + "step": 180 + }, + { + "epoch": 2.924324324324324, + "grad_norm": 0.6197820901870728, + "learning_rate": 3.940651830273342e-06, + "loss": 0.0237, + "step": 181 + }, + { + "epoch": 2.9405405405405407, + "grad_norm": 0.6041496992111206, + "learning_rate": 3.916421120763106e-06, + "loss": 0.0241, + "step": 182 + }, + { + "epoch": 2.9567567567567568, + "grad_norm": 0.5259250402450562, + "learning_rate": 3.891992839970855e-06, + "loss": 0.0207, + "step": 183 + }, + { + "epoch": 2.972972972972973, + "grad_norm": 0.6110473871231079, + "learning_rate": 3.8673703953060685e-06, + "loss": 0.0199, + "step": 184 + }, + { + "epoch": 2.9891891891891893, + "grad_norm": 0.504909098148346, + "learning_rate": 3.8425572212614155e-06, + "loss": 0.0211, + "step": 185 + }, + { + "epoch": 3.0, + "grad_norm": 1.010295033454895, + "learning_rate": 3.817556778933697e-06, + "loss": 0.0195, + "step": 186 + }, + { + "epoch": 3.016216216216216, + "grad_norm": 0.3988107442855835, + "learning_rate": 3.792372555541064e-06, + "loss": 0.0131, + "step": 187 + }, + { + "epoch": 3.0324324324324325, + "grad_norm": 0.31533047556877136, + "learning_rate": 3.7670080639366e-06, + "loss": 0.0107, + 
"step": 188 + }, + { + "epoch": 3.0486486486486486, + "grad_norm": 0.4819331765174866, + "learning_rate": 3.741466842118327e-06, + "loss": 0.0135, + "step": 189 + }, + { + "epoch": 3.064864864864865, + "grad_norm": 0.33277931809425354, + "learning_rate": 3.7157524527357036e-06, + "loss": 0.0112, + "step": 190 + }, + { + "epoch": 3.081081081081081, + "grad_norm": 0.3936960697174072, + "learning_rate": 3.6898684825926845e-06, + "loss": 0.0125, + "step": 191 + }, + { + "epoch": 3.097297297297297, + "grad_norm": 0.46424582600593567, + "learning_rate": 3.663818542147409e-06, + "loss": 0.0171, + "step": 192 + }, + { + "epoch": 3.1135135135135137, + "grad_norm": 0.4771481156349182, + "learning_rate": 3.6376062650085918e-06, + "loss": 0.0121, + "step": 193 + }, + { + "epoch": 3.1297297297297297, + "grad_norm": 0.4661300778388977, + "learning_rate": 3.61123530742869e-06, + "loss": 0.0164, + "step": 194 + }, + { + "epoch": 3.145945945945946, + "grad_norm": 0.424891859292984, + "learning_rate": 3.5847093477938955e-06, + "loss": 0.0116, + "step": 195 + }, + { + "epoch": 3.1621621621621623, + "grad_norm": 0.4026256501674652, + "learning_rate": 3.5580320861110627e-06, + "loss": 0.0135, + "step": 196 + }, + { + "epoch": 3.1783783783783783, + "grad_norm": 0.4946088492870331, + "learning_rate": 3.5312072434915983e-06, + "loss": 0.0104, + "step": 197 + }, + { + "epoch": 3.1945945945945944, + "grad_norm": 0.3920349180698395, + "learning_rate": 3.5042385616324243e-06, + "loss": 0.0127, + "step": 198 + }, + { + "epoch": 3.210810810810811, + "grad_norm": 0.35541996359825134, + "learning_rate": 3.477129802294057e-06, + "loss": 0.0095, + "step": 199 + }, + { + "epoch": 3.227027027027027, + "grad_norm": 0.4382397532463074, + "learning_rate": 3.4498847467759e-06, + "loss": 0.009, + "step": 200 + }, + { + "epoch": 3.2432432432432434, + "grad_norm": 0.4411066174507141, + "learning_rate": 3.4225071953887977e-06, + "loss": 0.0103, + "step": 201 + }, + { + "epoch": 3.2594594594594595, + 
"grad_norm": 0.4458081126213074, + "learning_rate": 3.3950009669249502e-06, + "loss": 0.011, + "step": 202 + }, + { + "epoch": 3.2756756756756755, + "grad_norm": 0.6307731866836548, + "learning_rate": 3.3673698981252385e-06, + "loss": 0.0123, + "step": 203 + }, + { + "epoch": 3.291891891891892, + "grad_norm": 0.48209428787231445, + "learning_rate": 3.3396178431440572e-06, + "loss": 0.0097, + "step": 204 + }, + { + "epoch": 3.308108108108108, + "grad_norm": 0.5341079831123352, + "learning_rate": 3.3117486730117092e-06, + "loss": 0.0098, + "step": 205 + }, + { + "epoch": 3.3243243243243246, + "grad_norm": 0.6222304701805115, + "learning_rate": 3.283766275094454e-06, + "loss": 0.0153, + "step": 206 + }, + { + "epoch": 3.3405405405405406, + "grad_norm": 0.6000131964683533, + "learning_rate": 3.255674552552267e-06, + "loss": 0.0124, + "step": 207 + }, + { + "epoch": 3.3567567567567567, + "grad_norm": 0.573101818561554, + "learning_rate": 3.227477423794412e-06, + "loss": 0.0111, + "step": 208 + }, + { + "epoch": 3.372972972972973, + "grad_norm": 0.49702250957489014, + "learning_rate": 3.1991788219328657e-06, + "loss": 0.0125, + "step": 209 + }, + { + "epoch": 3.389189189189189, + "grad_norm": 0.4762848913669586, + "learning_rate": 3.1707826942337124e-06, + "loss": 0.0089, + "step": 210 + }, + { + "epoch": 3.4054054054054053, + "grad_norm": 0.671346127986908, + "learning_rate": 3.142293001566548e-06, + "loss": 0.0128, + "step": 211 + }, + { + "epoch": 3.4216216216216218, + "grad_norm": 0.5226602554321289, + "learning_rate": 3.1137137178519983e-06, + "loss": 0.0119, + "step": 212 + }, + { + "epoch": 3.437837837837838, + "grad_norm": 0.40818020701408386, + "learning_rate": 3.085048829507406e-06, + "loss": 0.0097, + "step": 213 + }, + { + "epoch": 3.454054054054054, + "grad_norm": 0.5493360161781311, + "learning_rate": 3.056302334890786e-06, + "loss": 0.0117, + "step": 214 + }, + { + "epoch": 3.4702702702702704, + "grad_norm": 0.4277520477771759, + "learning_rate": 
3.027478243743106e-06, + "loss": 0.0115, + "step": 215 + }, + { + "epoch": 3.4864864864864864, + "grad_norm": 0.5382378697395325, + "learning_rate": 2.9985805766289815e-06, + "loss": 0.0141, + "step": 216 + }, + { + "epoch": 3.5027027027027025, + "grad_norm": 0.686087429523468, + "learning_rate": 2.9696133643758663e-06, + "loss": 0.0106, + "step": 217 + }, + { + "epoch": 3.518918918918919, + "grad_norm": 0.4279845356941223, + "learning_rate": 2.940580647511805e-06, + "loss": 0.0101, + "step": 218 + }, + { + "epoch": 3.535135135135135, + "grad_norm": 0.3966323435306549, + "learning_rate": 2.911486475701835e-06, + "loss": 0.0111, + "step": 219 + }, + { + "epoch": 3.5513513513513515, + "grad_norm": 0.4444655776023865, + "learning_rate": 2.8823349071831154e-06, + "loss": 0.0122, + "step": 220 + }, + { + "epoch": 3.5675675675675675, + "grad_norm": 0.4062931537628174, + "learning_rate": 2.853130008198855e-06, + "loss": 0.0088, + "step": 221 + }, + { + "epoch": 3.583783783783784, + "grad_norm": 0.4915783107280731, + "learning_rate": 2.8238758524311316e-06, + "loss": 0.0106, + "step": 222 + }, + { + "epoch": 3.6, + "grad_norm": 0.5415279865264893, + "learning_rate": 2.7945765204326664e-06, + "loss": 0.0102, + "step": 223 + }, + { + "epoch": 3.616216216216216, + "grad_norm": 0.5782752633094788, + "learning_rate": 2.7652360990576457e-06, + "loss": 0.0113, + "step": 224 + }, + { + "epoch": 3.6324324324324326, + "grad_norm": 0.5013785362243652, + "learning_rate": 2.735858680891656e-06, + "loss": 0.0092, + "step": 225 + }, + { + "epoch": 3.6486486486486487, + "grad_norm": 0.553318440914154, + "learning_rate": 2.7064483636808314e-06, + "loss": 0.0119, + "step": 226 + }, + { + "epoch": 3.6648648648648647, + "grad_norm": 0.47181862592697144, + "learning_rate": 2.677009249760268e-06, + "loss": 0.0152, + "step": 227 + }, + { + "epoch": 3.6810810810810812, + "grad_norm": 0.5877431631088257, + "learning_rate": 2.6475454454818072e-06, + "loss": 0.0181, + "step": 228 + }, + { + "epoch": 
3.6972972972972973, + "grad_norm": 0.6693160533905029, + "learning_rate": 2.6180610606412587e-06, + "loss": 0.0168, + "step": 229 + }, + { + "epoch": 3.7135135135135133, + "grad_norm": 0.6764176487922668, + "learning_rate": 2.5885602079051354e-06, + "loss": 0.0128, + "step": 230 + }, + { + "epoch": 3.72972972972973, + "grad_norm": 0.5279078483581543, + "learning_rate": 2.559047002236995e-06, + "loss": 0.0099, + "step": 231 + }, + { + "epoch": 3.745945945945946, + "grad_norm": 0.6159639954566956, + "learning_rate": 2.529525560323462e-06, + "loss": 0.0122, + "step": 232 + }, + { + "epoch": 3.762162162162162, + "grad_norm": 0.558202862739563, + "learning_rate": 2.5e-06, + "loss": 0.0118, + "step": 233 + }, + { + "epoch": 3.7783783783783784, + "grad_norm": 0.37727731466293335, + "learning_rate": 2.470474439676539e-06, + "loss": 0.0099, + "step": 234 + }, + { + "epoch": 3.7945945945945945, + "grad_norm": 0.4426223039627075, + "learning_rate": 2.4409529977630052e-06, + "loss": 0.0104, + "step": 235 + }, + { + "epoch": 3.810810810810811, + "grad_norm": 0.48488032817840576, + "learning_rate": 2.411439792094866e-06, + "loss": 0.0145, + "step": 236 + }, + { + "epoch": 3.827027027027027, + "grad_norm": 0.4551326632499695, + "learning_rate": 2.381938939358742e-06, + "loss": 0.0107, + "step": 237 + }, + { + "epoch": 3.8432432432432435, + "grad_norm": 0.738146185874939, + "learning_rate": 2.3524545545181936e-06, + "loss": 0.0132, + "step": 238 + }, + { + "epoch": 3.8594594594594596, + "grad_norm": 0.5243131518363953, + "learning_rate": 2.322990750239733e-06, + "loss": 0.0093, + "step": 239 + }, + { + "epoch": 3.8756756756756756, + "grad_norm": 0.4127775728702545, + "learning_rate": 2.2935516363191695e-06, + "loss": 0.0131, + "step": 240 + }, + { + "epoch": 3.891891891891892, + "grad_norm": 0.5398023128509521, + "learning_rate": 2.2641413191083445e-06, + "loss": 0.0122, + "step": 241 + }, + { + "epoch": 3.908108108108108, + "grad_norm": 0.45481953024864197, + "learning_rate": 
2.234763900942355e-06, + "loss": 0.0106, + "step": 242 + }, + { + "epoch": 3.924324324324324, + "grad_norm": 0.5116259455680847, + "learning_rate": 2.2054234795673336e-06, + "loss": 0.017, + "step": 243 + }, + { + "epoch": 3.9405405405405407, + "grad_norm": 0.4914882779121399, + "learning_rate": 2.1761241475688697e-06, + "loss": 0.0108, + "step": 244 + }, + { + "epoch": 3.9567567567567568, + "grad_norm": 0.42856982350349426, + "learning_rate": 2.146869991801146e-06, + "loss": 0.0111, + "step": 245 + }, + { + "epoch": 3.972972972972973, + "grad_norm": 0.45737922191619873, + "learning_rate": 2.117665092816885e-06, + "loss": 0.0095, + "step": 246 + }, + { + "epoch": 3.9891891891891893, + "grad_norm": 0.4869958758354187, + "learning_rate": 2.088513524298165e-06, + "loss": 0.0129, + "step": 247 + }, + { + "epoch": 4.0, + "grad_norm": 0.4869958758354187, + "learning_rate": 2.059419352488196e-06, + "loss": 0.0135, + "step": 248 + }, + { + "epoch": 4.0162162162162165, + "grad_norm": 0.8944225311279297, + "learning_rate": 2.030386635624135e-06, + "loss": 0.0067, + "step": 249 + }, + { + "epoch": 4.032432432432432, + "grad_norm": 0.3997219502925873, + "learning_rate": 2.0014194233710193e-06, + "loss": 0.0081, + "step": 250 + }, + { + "epoch": 4.048648648648649, + "grad_norm": 0.38661807775497437, + "learning_rate": 1.972521756256895e-06, + "loss": 0.0038, + "step": 251 + }, + { + "epoch": 4.064864864864865, + "grad_norm": 0.29563990235328674, + "learning_rate": 1.9436976651092143e-06, + "loss": 0.0066, + "step": 252 + }, + { + "epoch": 4.081081081081081, + "grad_norm": 0.254894495010376, + "learning_rate": 1.9149511704925945e-06, + "loss": 0.0058, + "step": 253 + }, + { + "epoch": 4.097297297297297, + "grad_norm": 0.23803076148033142, + "learning_rate": 1.8862862821480023e-06, + "loss": 0.0084, + "step": 254 + }, + { + "epoch": 4.113513513513514, + "grad_norm": 0.33866703510284424, + "learning_rate": 1.8577069984334522e-06, + "loss": 0.0099, + "step": 255 + }, + { + "epoch": 
4.12972972972973, + "grad_norm": 0.35744938254356384, + "learning_rate": 1.829217305766289e-06, + "loss": 0.0064, + "step": 256 + }, + { + "epoch": 4.145945945945946, + "grad_norm": 0.2983056902885437, + "learning_rate": 1.8008211780671353e-06, + "loss": 0.0111, + "step": 257 + }, + { + "epoch": 4.162162162162162, + "grad_norm": 0.29860594868659973, + "learning_rate": 1.772522576205589e-06, + "loss": 0.0046, + "step": 258 + }, + { + "epoch": 4.178378378378379, + "grad_norm": 0.4390548765659332, + "learning_rate": 1.7443254474477328e-06, + "loss": 0.0105, + "step": 259 + }, + { + "epoch": 4.194594594594594, + "grad_norm": 0.34081289172172546, + "learning_rate": 1.7162337249055478e-06, + "loss": 0.0093, + "step": 260 + }, + { + "epoch": 4.210810810810811, + "grad_norm": 0.32730501890182495, + "learning_rate": 1.6882513269882916e-06, + "loss": 0.0084, + "step": 261 + }, + { + "epoch": 4.227027027027027, + "grad_norm": 0.7214042544364929, + "learning_rate": 1.6603821568559436e-06, + "loss": 0.0044, + "step": 262 + }, + { + "epoch": 4.243243243243243, + "grad_norm": 0.36692219972610474, + "learning_rate": 1.6326301018747623e-06, + "loss": 0.0065, + "step": 263 + }, + { + "epoch": 4.2594594594594595, + "grad_norm": 0.32951614260673523, + "learning_rate": 1.6049990330750508e-06, + "loss": 0.0055, + "step": 264 + }, + { + "epoch": 4.275675675675676, + "grad_norm": 0.33026283979415894, + "learning_rate": 1.5774928046112025e-06, + "loss": 0.0052, + "step": 265 + }, + { + "epoch": 4.291891891891892, + "grad_norm": 0.2922450006008148, + "learning_rate": 1.5501152532241003e-06, + "loss": 0.0071, + "step": 266 + }, + { + "epoch": 4.308108108108108, + "grad_norm": 0.37345102429389954, + "learning_rate": 1.522870197705943e-06, + "loss": 0.0067, + "step": 267 + }, + { + "epoch": 4.324324324324325, + "grad_norm": 0.3973875343799591, + "learning_rate": 1.495761438367577e-06, + "loss": 0.0024, + "step": 268 + }, + { + "epoch": 4.34054054054054, + "grad_norm": 0.27959144115448, + 
"learning_rate": 1.4687927565084023e-06, + "loss": 0.0064, + "step": 269 + }, + { + "epoch": 4.356756756756757, + "grad_norm": 0.42801499366760254, + "learning_rate": 1.4419679138889379e-06, + "loss": 0.0074, + "step": 270 + }, + { + "epoch": 4.372972972972973, + "grad_norm": 0.5620250701904297, + "learning_rate": 1.415290652206105e-06, + "loss": 0.0085, + "step": 271 + }, + { + "epoch": 4.389189189189189, + "grad_norm": 0.4027979373931885, + "learning_rate": 1.3887646925713116e-06, + "loss": 0.0078, + "step": 272 + }, + { + "epoch": 4.405405405405405, + "grad_norm": 0.346365749835968, + "learning_rate": 1.3623937349914093e-06, + "loss": 0.0088, + "step": 273 + }, + { + "epoch": 4.421621621621622, + "grad_norm": 0.4886508584022522, + "learning_rate": 1.3361814578525922e-06, + "loss": 0.0058, + "step": 274 + }, + { + "epoch": 4.437837837837838, + "grad_norm": 0.33606165647506714, + "learning_rate": 1.3101315174073162e-06, + "loss": 0.0078, + "step": 275 + }, + { + "epoch": 4.454054054054054, + "grad_norm": 0.4041603207588196, + "learning_rate": 1.2842475472642969e-06, + "loss": 0.0037, + "step": 276 + }, + { + "epoch": 4.47027027027027, + "grad_norm": 0.9003347754478455, + "learning_rate": 1.258533157881674e-06, + "loss": 0.0054, + "step": 277 + }, + { + "epoch": 4.486486486486487, + "grad_norm": 0.5059202313423157, + "learning_rate": 1.2329919360634003e-06, + "loss": 0.0064, + "step": 278 + }, + { + "epoch": 4.5027027027027025, + "grad_norm": 0.4006575345993042, + "learning_rate": 1.2076274444589361e-06, + "loss": 0.0074, + "step": 279 + }, + { + "epoch": 4.518918918918919, + "grad_norm": 0.5821619033813477, + "learning_rate": 1.182443221066303e-06, + "loss": 0.0084, + "step": 280 + }, + { + "epoch": 4.535135135135135, + "grad_norm": 0.5989146828651428, + "learning_rate": 1.1574427787385853e-06, + "loss": 0.012, + "step": 281 + }, + { + "epoch": 4.551351351351351, + "grad_norm": 0.5475333333015442, + "learning_rate": 1.1326296046939334e-06, + "loss": 0.0035, + 
"step": 282 + }, + { + "epoch": 4.5675675675675675, + "grad_norm": 0.44198060035705566, + "learning_rate": 1.1080071600291453e-06, + "loss": 0.0055, + "step": 283 + }, + { + "epoch": 4.583783783783784, + "grad_norm": 1.231489658355713, + "learning_rate": 1.083578879236895e-06, + "loss": 0.0045, + "step": 284 + }, + { + "epoch": 4.6, + "grad_norm": 0.33816784620285034, + "learning_rate": 1.0593481697266582e-06, + "loss": 0.0076, + "step": 285 + }, + { + "epoch": 4.616216216216216, + "grad_norm": 0.5018823742866516, + "learning_rate": 1.0353184113494386e-06, + "loss": 0.0079, + "step": 286 + }, + { + "epoch": 4.632432432432433, + "grad_norm": 0.45690515637397766, + "learning_rate": 1.0114929559263122e-06, + "loss": 0.0061, + "step": 287 + }, + { + "epoch": 4.648648648648649, + "grad_norm": 0.4053182303905487, + "learning_rate": 9.878751267809069e-07, + "loss": 0.01, + "step": 288 + }, + { + "epoch": 4.664864864864865, + "grad_norm": 0.44401615858078003, + "learning_rate": 9.644682182758305e-07, + "loss": 0.0082, + "step": 289 + }, + { + "epoch": 4.681081081081081, + "grad_norm": 0.2947068214416504, + "learning_rate": 9.412754953531664e-07, + "loss": 0.0038, + "step": 290 + }, + { + "epoch": 4.697297297297297, + "grad_norm": 0.5023978352546692, + "learning_rate": 9.183001930790483e-07, + "loss": 0.0079, + "step": 291 + }, + { + "epoch": 4.713513513513513, + "grad_norm": 0.3027470111846924, + "learning_rate": 8.955455161924217e-07, + "loss": 0.0053, + "step": 292 + }, + { + "epoch": 4.72972972972973, + "grad_norm": 0.4287405014038086, + "learning_rate": 8.730146386580157e-07, + "loss": 0.0059, + "step": 293 + }, + { + "epoch": 4.745945945945946, + "grad_norm": 0.292800635099411, + "learning_rate": 8.507107032236323e-07, + "loss": 0.0041, + "step": 294 + }, + { + "epoch": 4.762162162162162, + "grad_norm": 0.23428203165531158, + "learning_rate": 8.286368209817644e-07, + "loss": 0.009, + "step": 295 + }, + { + "epoch": 4.778378378378378, + "grad_norm": 
0.48977068066596985, + "learning_rate": 8.067960709356479e-07, + "loss": 0.0054, + "step": 296 + }, + { + "epoch": 4.794594594594595, + "grad_norm": 0.46532517671585083, + "learning_rate": 7.851914995697801e-07, + "loss": 0.0056, + "step": 297 + }, + { + "epoch": 4.8108108108108105, + "grad_norm": 0.3898661732673645, + "learning_rate": 7.638261204249783e-07, + "loss": 0.0052, + "step": 298 + }, + { + "epoch": 4.827027027027027, + "grad_norm": 0.29227131605148315, + "learning_rate": 7.427029136780333e-07, + "loss": 0.0033, + "step": 299 + }, + { + "epoch": 4.8432432432432435, + "grad_norm": 0.36053887009620667, + "learning_rate": 7.218248257260127e-07, + "loss": 0.0128, + "step": 300 + }, + { + "epoch": 4.859459459459459, + "grad_norm": 0.4008624851703644, + "learning_rate": 7.011947687752804e-07, + "loss": 0.0106, + "step": 301 + }, + { + "epoch": 4.875675675675676, + "grad_norm": 0.5273243188858032, + "learning_rate": 6.808156204352845e-07, + "loss": 0.0081, + "step": 302 + }, + { + "epoch": 4.891891891891892, + "grad_norm": 0.328396201133728, + "learning_rate": 6.60690223317171e-07, + "loss": 0.0054, + "step": 303 + }, + { + "epoch": 4.908108108108108, + "grad_norm": 0.3451617360115051, + "learning_rate": 6.40821384637276e-07, + "loss": 0.007, + "step": 304 + }, + { + "epoch": 4.924324324324324, + "grad_norm": 0.4219134449958801, + "learning_rate": 6.212118758255595e-07, + "loss": 0.0099, + "step": 305 + }, + { + "epoch": 4.940540540540541, + "grad_norm": 0.5635795593261719, + "learning_rate": 6.018644321390288e-07, + "loss": 0.0061, + "step": 306 + }, + { + "epoch": 4.956756756756757, + "grad_norm": 0.18451224267482758, + "learning_rate": 5.827817522802065e-07, + "loss": 0.004, + "step": 307 + }, + { + "epoch": 4.972972972972973, + "grad_norm": 0.6001662015914917, + "learning_rate": 5.639664980207024e-07, + "loss": 0.0061, + "step": 308 + }, + { + "epoch": 4.989189189189189, + "grad_norm": 0.43228599429130554, + "learning_rate": 5.454212938299256e-07, + "loss": 
0.0089, + "step": 309 + }, + { + "epoch": 5.0, + "grad_norm": 0.805902898311615, + "learning_rate": 5.271487265090163e-07, + "loss": 0.0051, + "step": 310 + }, + { + "epoch": 5.0162162162162165, + "grad_norm": 0.3951954245567322, + "learning_rate": 5.091513448300142e-07, + "loss": 0.0076, + "step": 311 + }, + { + "epoch": 5.032432432432432, + "grad_norm": 0.2181919664144516, + "learning_rate": 4.914316591803475e-07, + "loss": 0.004, + "step": 312 + }, + { + "epoch": 5.048648648648649, + "grad_norm": 0.34473466873168945, + "learning_rate": 4.739921412126591e-07, + "loss": 0.0085, + "step": 313 + }, + { + "epoch": 5.064864864864865, + "grad_norm": 0.29619839787483215, + "learning_rate": 4.5683522350005505e-07, + "loss": 0.0051, + "step": 314 + }, + { + "epoch": 5.081081081081081, + "grad_norm": 0.20610685646533966, + "learning_rate": 4.399632991967867e-07, + "loss": 0.0068, + "step": 315 + }, + { + "epoch": 5.097297297297297, + "grad_norm": 0.43793216347694397, + "learning_rate": 4.23378721704443e-07, + "loss": 0.005, + "step": 316 + }, + { + "epoch": 5.113513513513514, + "grad_norm": 0.26209649443626404, + "learning_rate": 4.070838043436787e-07, + "loss": 0.0053, + "step": 317 + }, + { + "epoch": 5.12972972972973, + "grad_norm": 0.3751225173473358, + "learning_rate": 3.910808200315433e-07, + "loss": 0.0038, + "step": 318 + }, + { + "epoch": 5.145945945945946, + "grad_norm": 0.2594164311885834, + "learning_rate": 3.753720009644371e-07, + "loss": 0.0039, + "step": 319 + }, + { + "epoch": 5.162162162162162, + "grad_norm": 0.2959458529949188, + "learning_rate": 3.5995953830675004e-07, + "loss": 0.0068, + "step": 320 + }, + { + "epoch": 5.178378378378379, + "grad_norm": 0.29593151807785034, + "learning_rate": 3.448455818852267e-07, + "loss": 0.0049, + "step": 321 + }, + { + "epoch": 5.194594594594594, + "grad_norm": 0.26220938563346863, + "learning_rate": 3.3003223988909234e-07, + "loss": 0.003, + "step": 322 + }, + { + "epoch": 5.210810810810811, + "grad_norm": 
0.27080225944519043, + "learning_rate": 3.1552157857599327e-07, + "loss": 0.0054, + "step": 323 + }, + { + "epoch": 5.227027027027027, + "grad_norm": 0.32697442173957825, + "learning_rate": 3.0131562198377763e-07, + "loss": 0.0059, + "step": 324 + }, + { + "epoch": 5.243243243243243, + "grad_norm": 0.16209611296653748, + "learning_rate": 2.874163516481732e-07, + "loss": 0.005, + "step": 325 + }, + { + "epoch": 5.2594594594594595, + "grad_norm": 0.3344082534313202, + "learning_rate": 2.7382570632638853e-07, + "loss": 0.0048, + "step": 326 + }, + { + "epoch": 5.275675675675676, + "grad_norm": 0.3399072587490082, + "learning_rate": 2.605455817266861e-07, + "loss": 0.0056, + "step": 327 + }, + { + "epoch": 5.291891891891892, + "grad_norm": 0.25053179264068604, + "learning_rate": 2.4757783024395244e-07, + "loss": 0.004, + "step": 328 + }, + { + "epoch": 5.308108108108108, + "grad_norm": 0.2717658281326294, + "learning_rate": 2.3492426070131746e-07, + "loss": 0.0039, + "step": 329 + }, + { + "epoch": 5.324324324324325, + "grad_norm": 0.38859960436820984, + "learning_rate": 2.2258663809784892e-07, + "loss": 0.0051, + "step": 330 + }, + { + "epoch": 5.34054054054054, + "grad_norm": 0.2572256922721863, + "learning_rate": 2.1056668336235624e-07, + "loss": 0.0049, + "step": 331 + }, + { + "epoch": 5.356756756756757, + "grad_norm": 0.2547106146812439, + "learning_rate": 1.9886607311334987e-07, + "loss": 0.0022, + "step": 332 + }, + { + "epoch": 5.372972972972973, + "grad_norm": 0.29432207345962524, + "learning_rate": 1.8748643942516882e-07, + "loss": 0.0056, + "step": 333 + }, + { + "epoch": 5.389189189189189, + "grad_norm": 0.38799527287483215, + "learning_rate": 1.764293696003358e-07, + "loss": 0.0051, + "step": 334 + }, + { + "epoch": 5.405405405405405, + "grad_norm": 0.2726413905620575, + "learning_rate": 1.656964059481453e-07, + "loss": 0.0029, + "step": 335 + }, + { + "epoch": 5.421621621621622, + "grad_norm": 0.5010594725608826, + "learning_rate": 1.552890455695369e-07, 
+ "loss": 0.0062, + "step": 336 + }, + { + "epoch": 5.437837837837838, + "grad_norm": 0.3048969507217407, + "learning_rate": 1.4520874014826464e-07, + "loss": 0.0038, + "step": 337 + }, + { + "epoch": 5.454054054054054, + "grad_norm": 0.49177053570747375, + "learning_rate": 1.3545689574841341e-07, + "loss": 0.0078, + "step": 338 + }, + { + "epoch": 5.47027027027027, + "grad_norm": 0.6290957927703857, + "learning_rate": 1.2603487261826726e-07, + "loss": 0.0061, + "step": 339 + }, + { + "epoch": 5.486486486486487, + "grad_norm": 0.40990152955055237, + "learning_rate": 1.1694398500057714e-07, + "loss": 0.0043, + "step": 340 + }, + { + "epoch": 5.5027027027027025, + "grad_norm": 0.5539492964744568, + "learning_rate": 1.081855009492383e-07, + "loss": 0.0056, + "step": 341 + }, + { + "epoch": 5.518918918918919, + "grad_norm": 0.3589450418949127, + "learning_rate": 9.976064215241859e-08, + "loss": 0.0039, + "step": 342 + }, + { + "epoch": 5.535135135135135, + "grad_norm": 0.2895784080028534, + "learning_rate": 9.167058376214621e-08, + "loss": 0.0037, + "step": 343 + }, + { + "epoch": 5.551351351351351, + "grad_norm": 0.290560781955719, + "learning_rate": 8.391645423039357e-08, + "loss": 0.0057, + "step": 344 + }, + { + "epoch": 5.5675675675675675, + "grad_norm": 0.26922520995140076, + "learning_rate": 7.649933515167407e-08, + "loss": 0.0051, + "step": 345 + }, + { + "epoch": 5.583783783783784, + "grad_norm": 0.2856527864933014, + "learning_rate": 6.94202611121736e-08, + "loss": 0.0048, + "step": 346 + }, + { + "epoch": 5.6, + "grad_norm": 0.34312814474105835, + "learning_rate": 6.268021954544095e-08, + "loss": 0.002, + "step": 347 + }, + { + "epoch": 5.616216216216216, + "grad_norm": 0.1839967519044876, + "learning_rate": 5.628015059465364e-08, + "loss": 0.0054, + "step": 348 + }, + { + "epoch": 5.632432432432433, + "grad_norm": 0.43173515796661377, + "learning_rate": 5.022094698148072e-08, + "loss": 0.0063, + "step": 349 + }, + { + "epoch": 5.648648648648649, + 
"grad_norm": 0.33626168966293335, + "learning_rate": 4.450345388156141e-08, + "loss": 0.008, + "step": 350 + }, + { + "epoch": 5.664864864864865, + "grad_norm": 0.31997352838516235, + "learning_rate": 3.9128468806614304e-08, + "loss": 0.0052, + "step": 351 + }, + { + "epoch": 5.681081081081081, + "grad_norm": 0.2784101068973541, + "learning_rate": 3.4096741493194196e-08, + "loss": 0.0036, + "step": 352 + }, + { + "epoch": 5.697297297297297, + "grad_norm": 0.37963977456092834, + "learning_rate": 2.940897379811597e-08, + "loss": 0.0056, + "step": 353 + }, + { + "epoch": 5.713513513513513, + "grad_norm": 0.25286877155303955, + "learning_rate": 2.506581960055432e-08, + "loss": 0.0044, + "step": 354 + }, + { + "epoch": 5.72972972972973, + "grad_norm": 0.3452949821949005, + "learning_rate": 2.106788471083615e-08, + "loss": 0.0065, + "step": 355 + }, + { + "epoch": 5.745945945945946, + "grad_norm": 0.2761637270450592, + "learning_rate": 1.7415726785939836e-08, + "loss": 0.006, + "step": 356 + }, + { + "epoch": 5.762162162162162, + "grad_norm": 0.24352076649665833, + "learning_rate": 1.4109855251708272e-08, + "loss": 0.0037, + "step": 357 + }, + { + "epoch": 5.778378378378378, + "grad_norm": 0.4338150918483734, + "learning_rate": 1.115073123179128e-08, + "loss": 0.0061, + "step": 358 + }, + { + "epoch": 5.794594594594595, + "grad_norm": 0.36566513776779175, + "learning_rate": 8.538767483325384e-09, + "loss": 0.0043, + "step": 359 + }, + { + "epoch": 5.8108108108108105, + "grad_norm": 0.21839426457881927, + "learning_rate": 6.274328339360702e-09, + "loss": 0.0039, + "step": 360 + }, + { + "epoch": 5.827027027027027, + "grad_norm": 0.30049052834510803, + "learning_rate": 4.357729658039378e-09, + "loss": 0.0032, + "step": 361 + }, + { + "epoch": 5.8432432432432435, + "grad_norm": 0.5614826083183289, + "learning_rate": 2.789238778540537e-09, + "loss": 0.0048, + "step": 362 + }, + { + "epoch": 5.859459459459459, + "grad_norm": 0.23777024447917938, + "learning_rate": 
1.5690744837873473e-09, + "loss": 0.0037, + "step": 363 + }, + { + "epoch": 5.875675675675676, + "grad_norm": 0.2646014094352722, + "learning_rate": 6.974069699314246e-10, + "loss": 0.0041, + "step": 364 + }, + { + "epoch": 5.891891891891892, + "grad_norm": 0.2773268222808838, + "learning_rate": 1.743578226129361e-10, + "loss": 0.0039, + "step": 365 + }, + { + "epoch": 5.908108108108108, + "grad_norm": 0.23373937606811523, + "learning_rate": 0.0, + "loss": 0.0042, + "step": 366 + } + ], + "logging_steps": 1, + "max_steps": 366, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 61, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 9.182143392723763e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-366/training_args.bin b/checkpoint-366/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..5b28d107f55169977eced33ac6929abb398bb2c5 --- /dev/null +++ b/checkpoint-366/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e2f1aaf0f48ae52048eea3703205522237e597bd418f53d57d152ef3ad9cbbc +size 8056 diff --git a/checkpoint-366/zero_to_fp32.py b/checkpoint-366/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/checkpoint-366/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. 
# Once extracted, the weights don't require DeepSpeed and can be used in any
# application.
#
# example: python zero_to_fp32.py . pytorch_model.bin

import argparse
import torch
import glob
import math
import os
import re
from collections import OrderedDict
from dataclasses import dataclass

# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
# DeepSpeed data structures it has to be available in the current python environment.
from deepspeed.utils import logger
from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)


@dataclass
class zero_model_state:
    """Per-rank model information extracted from one ``*_model_states.pt`` file."""
    # buffers found under "module", converted to fp32
    buffers: dict
    # list of ``{param_name: shape}`` dicts (iterated as one dict per param group below)
    param_shapes: list
    # ``[name, source_name]`` pairs built from the "shared_params" entry
    shared_params: list
    # value stored under DS_VERSION, or None when the key is absent
    ds_version: int
    # ``{param_name: shape}`` of frozen params, or None when the key is absent
    frozen_param_shapes: dict
    # ``{param_name: tensor}`` of this rank's frozen fragments, or None when the key is absent
    frozen_param_fragments: dict


debug = 0

# load to cpu
device = torch.device('cpu')


def atoi(text):
    """Return ``text`` as an int when it consists only of digits, otherwise unchanged."""
    return int(text) if text.isdigit() else text


def natural_keys(text):
    '''
    alist.sort(key=natural_keys) sorts in human order
    http://nedbatchelder.com/blog/200712/human_sorting.html
    (See Toothy's implementation in the comments)
    '''
    return [atoi(c) for c in re.split(r'(\d+)', text)]


def get_model_state_file(checkpoint_dir, zero_stage):
    """
    Return the path of the rank-0 model states file inside ``checkpoint_dir``.

    Raises:
        - ``FileNotFoundError``: the directory or the expected file doesn't exist
        - ``ValueError``: ``zero_stage`` is greater than 3
    """
    if not os.path.isdir(checkpoint_dir):
        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")

    # there should be only one file
    if zero_stage <= 2:
        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
    elif zero_stage == 3:
        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
    else:
        # without this branch ``file`` would be unbound and the check below would crash
        raise ValueError(f"unknown zero stage {zero_stage}")

    if not os.path.exists(file):
        raise FileNotFoundError(f"can't find model states file at '{file}'")

    return file


def get_checkpoint_files(checkpoint_dir, glob_pattern):
    """
    Return the files in ``checkpoint_dir`` matching ``glob_pattern``, in natural (human) order.

    Raises:
        - ``FileNotFoundError``: nothing matched the pattern
    """
    # XXX: need to test that this simple glob rule works for multi-node setup too
    ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)

    if not ckpt_files:
        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")

    return ckpt_files


def get_optim_files(checkpoint_dir):
    """Return the ``*_optim_states.pt`` files found in ``checkpoint_dir``."""
    return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")


def get_model_state_files(checkpoint_dir):
    """Return the ``*_model_states.pt`` files found in ``checkpoint_dir``."""
    return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")


def parse_model_states(files):
    """
    Load every model states file and return one ``zero_model_state`` per file.

    Raises:
        - ``ValueError``: a file has no BUFFER_NAMES entry, i.e. is not a model state checkpoint
    """
    zero_model_states = []
    for file in files:
        # SECURITY NOTE: these checkpoints are pickles containing DeepSpeed objects, so they can't be
        # loaded with ``weights_only=True`` (the default since torch 2.6). Unpickling can execute
        # arbitrary code - only convert checkpoints that come from a trusted source.
        state_dict = torch.load(file, map_location=device, weights_only=False)

        if BUFFER_NAMES not in state_dict:
            raise ValueError(f"{file} is not a model state checkpoint")
        buffer_names = state_dict[BUFFER_NAMES]
        if debug:
            print("Found buffers:", buffer_names)

        # recover just the buffers while restoring them to fp32 if they were saved in fp16
        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
        param_shapes = state_dict[PARAM_SHAPES]

        # frozen parameters are optional
        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
        if frozen_param_shapes is not None and debug:
            print(f"Found frozen_param_shapes: {frozen_param_shapes}")

        # handle shared params
        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]

        ds_version = state_dict.get(DS_VERSION, None)

        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)

        z_model_state = zero_model_state(buffers=buffers,
                                         param_shapes=param_shapes,
                                         shared_params=shared_params,
                                         ds_version=ds_version,
                                         frozen_param_shapes=frozen_param_shapes,
                                         frozen_param_fragments=frozen_param_fragments)
        zero_model_states.append(z_model_state)

    return zero_model_states


def parse_optim_states(files, ds_checkpoint_dir):
    """
    Load the optimizer state files and extract the fp32 master weights.

    Args:
        - ``files``: paths of the ``*_optim_states.pt`` files
        - ``ds_checkpoint_dir``: checkpoint folder, used only in the error message

    Returns:
        - ``(zero_stage, world_size, fp32_flat_groups)`` where ``fp32_flat_groups`` has one entry per
          file: the list of group tensors for stage <= 2, a single concatenated tensor for stage 3

    Raises:
        - ``ValueError``: not a zero checkpoint, unexpected number of files, or unknown zero stage
    """
    total_files = len(files)
    state_dicts = []
    for f in files:
        # SECURITY NOTE: pickled DeepSpeed objects - see the note in ``parse_model_states``.
        state_dict = torch.load(f, map_location=device, weights_only=False)
        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
        # and also handle the case where it was already removed by another helper script
        state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
        state_dicts.append(state_dict)

    if ZERO_STAGE not in state_dicts[0][OPTIMIZER_STATE_DICT]:
        raise ValueError(f"{files[0]} is not a zero checkpoint")
    zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
    world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]

    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
    # parameters can be different from data parallelism for non-expert parameters. So we can just
    # use the max of the partition_count to get the dp world_size.

    if isinstance(world_size, list):
        world_size = max(world_size)

    if world_size != total_files:
        raise ValueError(
            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
        )

    # the groups are named differently in each stage
    if zero_stage <= 2:
        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
    elif zero_stage == 3:
        fp32_groups_key = FP32_FLAT_GROUPS
    else:
        raise ValueError(f"unknown zero stage {zero_stage}")

    if zero_stage <= 2:
        fp32_flat_groups = [sd[OPTIMIZER_STATE_DICT][fp32_groups_key] for sd in state_dicts]
    elif zero_stage == 3:
        # if there is more than one param group, there will be multiple flattened tensors - one
        # flattened tensor per group - for simplicity merge them into a single tensor
        #
        # XXX: could make the script more memory efficient for when there are multiple groups - it
        # will require matching the sub-lists of param_shapes for each param group flattened tensor

        fp32_flat_groups = [torch.cat(sd[OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for sd in state_dicts]

    return zero_stage, world_size, fp32_flat_groups


def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
    """
    Returns fp32 state_dict reconstructed from ds checkpoint

    Args:
        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
        - ``exclude_frozen_parameters``: when true, frozen parameters are left out of the result

    """
    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")

    optim_files = get_optim_files(ds_checkpoint_dir)
    zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")

    model_files = get_model_state_files(ds_checkpoint_dir)

    zero_model_states = parse_model_states(model_files)
    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')

    if zero_stage <= 2:
        return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
                                                          exclude_frozen_parameters)
    elif zero_stage == 3:
        return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
                                                          exclude_frozen_parameters)

    # ``parse_optim_states`` already rejects other stages; fail loudly rather than return None
    raise ValueError(f"unknown zero stage {zero_stage}")


def _has_callable(obj, fn):
    """Return True when ``obj`` has a callable attribute named ``fn``."""
    attr = getattr(obj, fn, None)
    return callable(attr)


def _shape_numel(shape):
    """Return the number of elements described by ``shape``.

    Uses ``shape.numel()`` when available and otherwise multiplies the dimensions, so that
    plain sequences of ints are handled the same way in every merge function.
    """
    return shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)


def _zero2_merge_frozen_params(state_dict, zero_model_states):
    """Copy rank 0's frozen parameter fragments into ``state_dict`` (no-op when there are none)."""
    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
    if frozen_param_shapes is None or len(frozen_param_shapes) == 0:
        return

    frozen_param_fragments = zero_model_states[0].frozen_param_fragments

    if debug:
        num_elem = sum(_shape_numel(s) for s in frozen_param_shapes.values())
        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')

    wanted_params = len(frozen_param_shapes)
    wanted_numel = sum(_shape_numel(s) for s in frozen_param_shapes.values())
    avail_numel = sum(p.numel() for p in frozen_param_fragments.values())
    print(f'Frozen params: Have {avail_numel} numels to process.')
    print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')

    total_params = 0
    total_numel = 0
    for name, shape in frozen_param_shapes.items():
        total_params += 1
        unpartitioned_numel = _shape_numel(shape)
        total_numel += unpartitioned_numel

        state_dict[name] = frozen_param_fragments[name]

        if debug:
            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")

    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")


def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
    """
    Rebuild every trainable parameter from the per-rank flat fp32 groups (stage <= 2).

    Raises:
        - ``ValueError``: the aligned number of consumed elements doesn't match what is available
    """
    param_shapes = zero_model_states[0].param_shapes

    # Reconstruction protocol:
    #
    # XXX: document this

    if debug:
        for i in range(world_size):
            for j in range(len(fp32_flat_groups[0])):
                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")

    # XXX: memory usage doubles here (zero2)
    num_param_groups = len(fp32_flat_groups[0])
    merged_single_partition_of_fp32_groups = []
    for i in range(num_param_groups):
        # concatenate group ``i`` of every rank into one flat vector
        merged_partitions = [sd[i] for sd in fp32_flat_groups]
        full_single_fp32_vector = torch.cat(merged_partitions, 0)
        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
    avail_numel = sum(vector.numel() for vector in merged_single_partition_of_fp32_groups)

    if debug:
        wanted_params = sum(len(shapes) for shapes in param_shapes)
        wanted_numel = sum(sum(_shape_numel(shape) for shape in shapes.values()) for shapes in param_shapes)
        # not asserting if there is a mismatch due to possible padding
        print(f"Have {avail_numel} numels to process.")
        print(f"Need {wanted_numel} numels in {wanted_params} params.")

    # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
    # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
    # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
    # live optimizer object, so we are checking that the numbers are within the right range
    align_to = 2 * world_size

    def zero2_align(x):
        return align_to * math.ceil(x / align_to)

    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    total_numel = 0
    total_params = 0
    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
        offset = 0
        avail_numel = full_single_fp32_vector.numel()
        for name, shape in shapes.items():

            unpartitioned_numel = _shape_numel(shape)
            total_numel += unpartitioned_numel
            total_params += 1

            if debug:
                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
            offset += unpartitioned_numel

        if debug:
            print(f"original offset={offset}, avail_numel={avail_numel}")

        offset = zero2_align(offset)
        avail_numel = zero2_align(avail_numel)

        if debug:
            print(f"aligned offset={offset}, avail_numel={avail_numel}")

        # Sanity check
        if offset != avail_numel:
            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")

    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")


def _recover_shared_params(state_dict, shared_params):
    """Point every shared parameter name at the tensor of its source, when the source was recovered."""
    for name, source_name in shared_params:
        if source_name in state_dict:
            state_dict[name] = state_dict[source_name]


def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
                                               exclude_frozen_parameters):
    """Assemble the consolidated fp32 ``state_dict`` for a stage <= 2 checkpoint."""
    state_dict = OrderedDict()

    # buffers
    buffers = zero_model_states[0].buffers
    state_dict.update(buffers)
    if debug:
        print(f"added {len(buffers)} buffers")

    if not exclude_frozen_parameters:
        _zero2_merge_frozen_params(state_dict, zero_model_states)

    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)

    # recover shared parameters
    _recover_shared_params(state_dict, zero_model_states[0].shared_params)

    return state_dict


def zero3_partitioned_param_info(unpartitioned_numel, world_size):
    """
    Return ``(partitioned_numel, padding_numel)`` for a parameter split across ``world_size`` ranks:
    the per-rank element count (rounded up) and the padding needed to make the total divisible.
    """
    remainder = unpartitioned_numel % world_size
    padding_numel = (world_size - remainder) if remainder else 0
    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
    return partitioned_numel, padding_numel


def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
    """Rebuild frozen parameters by concatenating every rank's fragment (no-op when there are none)."""
    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
    if frozen_param_shapes is None or len(frozen_param_shapes) == 0:
        return

    if debug:
        for i in range(world_size):
            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')

    wanted_params = len(frozen_param_shapes)
    wanted_numel = sum(_shape_numel(s) for s in frozen_param_shapes.values())
    avail_numel = sum(p.numel() for p in zero_model_states[0].frozen_param_fragments.values()) * world_size
    print(f'Frozen params: Have {avail_numel} numels to process.')
    print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')

    total_params = 0
    total_numel = 0
    for name, shape in frozen_param_shapes.items():
        total_params += 1
        unpartitioned_numel = _shape_numel(shape)
        total_numel += unpartitioned_numel

        # join the fragments of all ranks, then drop the trailing padding
        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
        state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)

        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)

        if debug:
            print(
                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
            )

    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")


def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
    """
    Rebuild every trainable parameter from the per-rank flat fp32 tensors (stage 3).

    Raises:
        - ``ValueError``: the number of consumed elements doesn't match what is available
    """
    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
    # param, re-consolidating each param, while dealing with padding if any

    # merge list of dicts, preserving order
    param_shapes = {k: v for d in zero_model_states[0].param_shapes for k, v in d.items()}

    if debug:
        for i in range(world_size):
            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")

    wanted_params = len(param_shapes)
    wanted_numel = sum(_shape_numel(shape) for shape in param_shapes.values())
    # not asserting if there is a mismatch due to possible padding
    avail_numel = fp32_flat_groups[0].numel() * world_size
    print(f"Trainable params: Have {avail_numel} numels to process.")
    print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")

    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    offset = 0
    total_numel = 0
    total_params = 0
    for name, shape in param_shapes.items():

        unpartitioned_numel = _shape_numel(shape)
        total_numel += unpartitioned_numel
        total_params += 1

        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)

        if debug:
            print(
                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
            )

        # XXX: memory usage doubles here
        state_dict[name] = torch.cat(
            tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)),
            0).narrow(0, 0, unpartitioned_numel).view(shape)
        offset += partitioned_numel

    # ``offset`` counted per-rank elements; scale it to compare against all ranks together
    offset *= world_size

    # Sanity check
    if offset != avail_numel:
        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")

    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")


def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
                                               exclude_frozen_parameters):
    """Assemble the consolidated fp32 ``state_dict`` for a stage 3 checkpoint."""
    state_dict = OrderedDict()

    # buffers
    buffers = zero_model_states[0].buffers
    state_dict.update(buffers)
    if debug:
        print(f"added {len(buffers)} buffers")

    if not exclude_frozen_parameters:
        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)

    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)

    # recover shared parameters
    _recover_shared_params(state_dict, zero_model_states[0].shared_params)

    return state_dict


def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False):
    """
    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
    via a model hub.

    Args:
        - ``checkpoint_dir``: path to the desired checkpoint folder
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
        - ``exclude_frozen_parameters``: exclude frozen parameters

    Returns:
        - pytorch ``state_dict``

    Raises:
        - ``ValueError``: no ``tag`` was given and there is no 'latest' file in ``checkpoint_dir``
        - ``FileNotFoundError``: the tag folder doesn't exist

    Note: this approach may not work if your application doesn't have sufficient free CPU memory and
    you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
    the checkpoint.

    A typical usage might be ::

        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
        # do the training and checkpoint saving
        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
        model = model.cpu() # move to cpu
        model.load_state_dict(state_dict)
        # submit to model hub or save the model to share with others

    In this example the ``model`` will no longer be usable in the deepspeed context of the same
    application. i.e. you will need to re-initialize the deepspeed engine, since
    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.

    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.

    """
    if tag is None:
        latest_path = os.path.join(checkpoint_dir, 'latest')
        if os.path.isfile(latest_path):
            with open(latest_path, 'r') as fd:
                tag = fd.read().strip()
        else:
            raise ValueError(f"Unable to find 'latest' file at {latest_path}")

    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)

    if not os.path.isdir(ds_checkpoint_dir):
        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")

    return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)


def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False):
    """
    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
    loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.

    Args:
        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
        - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
        - ``exclude_frozen_parameters``: exclude frozen parameters
    """

    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters)
    print(f"Saving fp32 state dict to {output_file}")
    torch.save(state_dict, output_file)


def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
    """
    1. Put the provided model to cpu
    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
    3. Load it into the provided model

    Args:
        - ``model``: the model object to update
        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``

    Returns:
        - ``model``: modified model

    Make sure you have plenty of CPU memory available before you call this function. If you don't
    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
    conveniently placed for you in the checkpoint folder.

    A typical usage might be ::

        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
        # submit to model hub or save the model to share with others

    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.

    """
    logger.info("Extracting fp32 weights")
    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)

    logger.info("Overwriting model with fp32 weights")
    model = model.cpu()
    # strict=False: keys that don't match between the model and the recovered state_dict are ignored
    model.load_state_dict(state_dict, strict=False)

    return model


if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument("checkpoint_dir",
                        type=str,
                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
    parser.add_argument(
        "output_file",
        type=str,
        help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)")
    parser.add_argument("-t",
                        "--tag",
                        type=str,
                        default=None,
                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
    args = parser.parse_args()

    # module-level flag read by the helper functions above
    debug = args.debug

    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
                                               args.output_file,
                                               tag=args.tag,
                                               exclude_frozen_parameters=args.exclude_frozen_parameters)