diff --git a/README.md b/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..f235b88ff7f73df087c733ca5a1d0771254e22e0
--- /dev/null
+++ b/README.md
@@ -0,0 +1,146 @@
+---
+library_name: peft
+license: llama3.1
+base_model: nvidia/Llama-3.1-Nemotron-70B-Instruct-HF
+tags:
+- generated_from_trainer
+datasets:
+- ugaoo/multimedqa_wrongnemotron70
+model-index:
+- name: out/multimedqa_wrongnemotron70
+ results: []
+---
+
+
+
+[
](https://github.com/axolotl-ai-cloud/axolotl)
+See axolotl config
+
+axolotl version: `0.8.0.dev0`
+```yaml
+base_model: nvidia/Llama-3.1-Nemotron-70B-Instruct-HF
+model_type: AutoModelForCausalLM
+tokenizer_type: AutoTokenizer
+trust_remote_code: true
+
+load_in_8bit: false
+load_in_4bit: true
+strict: false
+
+datasets:
+ - path: ugaoo/multimedqa_wrongnemotron70
+ type: alpaca
+val_set_size: 0
+output_dir: ./out/multimedqa_wrongnemotron70
+
+sequence_len: 4000
+sample_packing: true
+pad_to_sequence_len: true
+
+adapter: qlora
+lora_r: 256
+lora_alpha: 512
+lora_dropout: 0.05
+lora_target_linear: true
+lora_target_modules:
+ - q_proj
+ - k_proj
+ - v_proj
+ - o_proj
+ - up_proj
+ - down_proj
+ - gate_proj
+lora_modules_to_save:
+ - embed_tokens
+ - lm_head
+
+wandb_project: cosmosearch
+wandb_entity:
+wandb_watch:
+wandb_name: multimedqa_wrongnemotron70_Nemotron-70B
+wandb_log_model:
+
+gradient_accumulation_steps: 3
+micro_batch_size: 4
+num_epochs: 6
+optimizer: adamw_torch
+lr_scheduler: cosine
+learning_rate: 5e-6
+
+train_on_inputs: false
+group_by_length: false
+bf16: auto
+fp16: false
+tf32: false
+
+gradient_checkpointing: true
+early_stopping_patience:
+resume_from_checkpoint:
+logging_steps: 1
+xformers_attention:
+flash_attention: true
+
+warmup_steps: 100
+evals_per_epoch: 6
+eval_table_size:
+saves_per_epoch: 1
+debug:
+deepspeed:
+weight_decay: 0.0
+fsdp:
+fsdp_config:
+save_total_limit: 6
+special_tokens:
+ pad_token: <|end_of_text|>
+
+```
+
+
+
+# out/multimedqa_wrongnemotron70
+
+This model is a fine-tuned version of [nvidia/Llama-3.1-Nemotron-70B-Instruct-HF](https://huggingface.co/nvidia/Llama-3.1-Nemotron-70B-Instruct-HF) on the ugaoo/multimedqa_wrongnemotron70 dataset.
+
+## Model description
+
+More information needed
+
+## Intended uses & limitations
+
+More information needed
+
+## Training and evaluation data
+
+More information needed
+
+## Training procedure
+
+### Training hyperparameters
+
+The following hyperparameters were used during training:
+- learning_rate: 5e-06
+- train_batch_size: 4
+- eval_batch_size: 4
+- seed: 42
+- distributed_type: multi-GPU
+- num_devices: 3
+- gradient_accumulation_steps: 3
+- total_train_batch_size: 36
+- total_eval_batch_size: 12
+- optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
+- lr_scheduler_type: cosine
+- lr_scheduler_warmup_steps: 100
+- num_epochs: 6.0
+
+### Training results
+
+
+
+### Framework versions
+
+- PEFT 0.15.0
+- Transformers 4.49.0
+- Pytorch 2.5.1+cu124
+- Datasets 3.4.1
+- Tokenizers 0.21.1
\ No newline at end of file
diff --git a/adapter_config.json b/adapter_config.json
index e80b3379560eb1336c609e937ceaed982cf3933f..3abb5d68d20446d2b99ace226d6233a68590205a 100644
--- a/adapter_config.json
+++ b/adapter_config.json
@@ -1,13 +1,13 @@
{
"alpha_pattern": {},
"auto_mapping": null,
- "base_model_name_or_path": "meta-llama/Llama-3.1-70B-Instruct",
+ "base_model_name_or_path": "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF",
"bias": "none",
"corda_config": null,
"eva_config": null,
"exclude_modules": null,
"fan_in_fan_out": null,
- "inference_mode": false,
+ "inference_mode": true,
"init_lora_weights": true,
"layer_replication": null,
"layers_pattern": null,
@@ -27,13 +27,13 @@
"rank_pattern": {},
"revision": null,
"target_modules": [
- "k_proj",
- "q_proj",
- "down_proj",
"o_proj",
+ "up_proj",
+ "down_proj",
"gate_proj",
"v_proj",
- "up_proj"
+ "k_proj",
+ "q_proj"
],
"task_type": "CAUSAL_LM",
"trainable_token_indices": null,
diff --git a/adapter_model.safetensors b/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..687651e596b08c7a9d61272bbcf8fa414e20c651
--- /dev/null
+++ b/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5d972b408cb0b67bf1dc53652467b5e0918debd7a39c06db3c21ed66e350d48d
+size 10829849744
diff --git a/checkpoint-144/README.md b/checkpoint-144/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..fdf619c317c2fe82074662582dbd68166b6f9d50
--- /dev/null
+++ b/checkpoint-144/README.md
@@ -0,0 +1,202 @@
+---
+base_model: nvidia/Llama-3.1-Nemotron-70B-Instruct-HF
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.15.0
\ No newline at end of file
diff --git a/checkpoint-144/adapter_config.json b/checkpoint-144/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..3abb5d68d20446d2b99ace226d6233a68590205a
--- /dev/null
+++ b/checkpoint-144/adapter_config.json
@@ -0,0 +1,42 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF",
+ "bias": "none",
+ "corda_config": null,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": null,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 512,
+ "lora_bias": false,
+ "lora_dropout": 0.05,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": [
+ "embed_tokens",
+ "lm_head"
+ ],
+ "peft_type": "LORA",
+ "r": 256,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "o_proj",
+ "up_proj",
+ "down_proj",
+ "gate_proj",
+ "v_proj",
+ "k_proj",
+ "q_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-144/adapter_model.safetensors b/checkpoint-144/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..c48e73dc56ffc1e362c24ef12788f9cecc534d8c
--- /dev/null
+++ b/checkpoint-144/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1f663d610d080456f9cbc8eeb182c48be8a89be133c8d590eaf93d04e80d25e1
+size 10829849744
diff --git a/checkpoint-144/global_step143/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-144/global_step143/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..7020022b9071ebc847191ea872ba6507e207f440
--- /dev/null
+++ b/checkpoint-144/global_step143/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8c1036ef9f20f5a38378195a6783b47b5e9a5e41196556ef59d1d0ff09256f10
+size 21659418140
diff --git a/checkpoint-144/global_step143/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-144/global_step143/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..e35f608799c553e1cbaba4bee2e7b4460599ce66
--- /dev/null
+++ b/checkpoint-144/global_step143/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9a45cacd90984918f7d85e007c9d73877ee14b68ec5e17cc43f250325372966d
+size 21659457372
diff --git a/checkpoint-144/global_step143/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-144/global_step143/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..000a8c8f7edbbbe840db0c4aa7507f1270634644
--- /dev/null
+++ b/checkpoint-144/global_step143/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:614a95a7c4e7a25c606de33dda22d69076a90f9dcb19100099421ceb20f5d8a8
+size 21659417820
diff --git a/checkpoint-144/global_step143/mp_rank_00_model_states.pt b/checkpoint-144/global_step143/mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..79de1b2dfec0d3950f5e73e8dd31111d70d21810
--- /dev/null
+++ b/checkpoint-144/global_step143/mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dd8fd06237d78218e435a97b67ec37dfdbc486f4eeb5c9f4625531d27592ea31
+size 11918643933
diff --git a/checkpoint-144/latest b/checkpoint-144/latest
new file mode 100644
index 0000000000000000000000000000000000000000..93407f5a9fdef065b428ddd4b9440e88eb65a982
--- /dev/null
+++ b/checkpoint-144/latest
@@ -0,0 +1 @@
+global_step143
\ No newline at end of file
diff --git a/checkpoint-144/rng_state_0.pth b/checkpoint-144/rng_state_0.pth
new file mode 100644
index 0000000000000000000000000000000000000000..64b63e44f4cfdd29a0ce453bd6c6ce36968570bc
--- /dev/null
+++ b/checkpoint-144/rng_state_0.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:52cfa88e96395a9f554de4f79c4baba7be8d9940fe5b00d6c840fc070c9e0871
+size 14768
diff --git a/checkpoint-144/rng_state_1.pth b/checkpoint-144/rng_state_1.pth
new file mode 100644
index 0000000000000000000000000000000000000000..d0c1d24cbdbe1b77b5ac7d8dfa649c9cc6d80415
--- /dev/null
+++ b/checkpoint-144/rng_state_1.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f675af0d887993afac3122873bcddaa14afdea8fb3ff46a0ea096b2acca2bc0f
+size 14768
diff --git a/checkpoint-144/rng_state_2.pth b/checkpoint-144/rng_state_2.pth
new file mode 100644
index 0000000000000000000000000000000000000000..9bae09d9af3d65c424cb5cf304472803673786f0
--- /dev/null
+++ b/checkpoint-144/rng_state_2.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f112bdcf15570488162c8646b9e1e3c10f4135c8f174cc0118c1172493350e4e
+size 14768
diff --git a/checkpoint-144/scheduler.pt b/checkpoint-144/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..e2dde2b518bcd4fd63433ef1bc0e8e93df9dd450
--- /dev/null
+++ b/checkpoint-144/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f3c70fd0b9904cd462c44b91c3a6304d50f808b742c89f710f481983baa6497c
+size 1064
diff --git a/checkpoint-144/special_tokens_map.json b/checkpoint-144/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..278b7f0f84be865c4687700ee7b3c63d89a51e18
--- /dev/null
+++ b/checkpoint-144/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+ "bos_token": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "<|eot_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/checkpoint-144/tokenizer.json b/checkpoint-144/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2
--- /dev/null
+++ b/checkpoint-144/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
+size 17209920
diff --git a/checkpoint-144/tokenizer_config.json b/checkpoint-144/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..fdde94c29816839ec3c29d6c6461206a49911f3c
--- /dev/null
+++ b/checkpoint-144/tokenizer_config.json
@@ -0,0 +1,2064 @@
+{
+ "added_tokens_decoder": {
+ "128000": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128001": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128002": {
+ "content": "<|reserved_special_token_0|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128003": {
+ "content": "<|reserved_special_token_1|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128004": {
+ "content": "<|finetune_right_pad_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128005": {
+ "content": "<|reserved_special_token_2|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128006": {
+ "content": "<|start_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128007": {
+ "content": "<|end_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128008": {
+ "content": "<|eom_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128009": {
+ "content": "<|eot_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128010": {
+ "content": "<|python_tag|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128011": {
+ "content": "<|reserved_special_token_3|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128012": {
+ "content": "<|reserved_special_token_4|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128013": {
+ "content": "<|reserved_special_token_5|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128014": {
+ "content": "<|reserved_special_token_6|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128015": {
+ "content": "<|reserved_special_token_7|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128016": {
+ "content": "<|reserved_special_token_8|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128017": {
+ "content": "<|reserved_special_token_9|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128018": {
+ "content": "<|reserved_special_token_10|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128019": {
+ "content": "<|reserved_special_token_11|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128020": {
+ "content": "<|reserved_special_token_12|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128021": {
+ "content": "<|reserved_special_token_13|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128022": {
+ "content": "<|reserved_special_token_14|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128023": {
+ "content": "<|reserved_special_token_15|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128024": {
+ "content": "<|reserved_special_token_16|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128025": {
+ "content": "<|reserved_special_token_17|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128026": {
+ "content": "<|reserved_special_token_18|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128027": {
+ "content": "<|reserved_special_token_19|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128028": {
+ "content": "<|reserved_special_token_20|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128029": {
+ "content": "<|reserved_special_token_21|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128030": {
+ "content": "<|reserved_special_token_22|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128031": {
+ "content": "<|reserved_special_token_23|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128032": {
+ "content": "<|reserved_special_token_24|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128033": {
+ "content": "<|reserved_special_token_25|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128034": {
+ "content": "<|reserved_special_token_26|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128035": {
+ "content": "<|reserved_special_token_27|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128036": {
+ "content": "<|reserved_special_token_28|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128037": {
+ "content": "<|reserved_special_token_29|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128038": {
+ "content": "<|reserved_special_token_30|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128039": {
+ "content": "<|reserved_special_token_31|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128040": {
+ "content": "<|reserved_special_token_32|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128041": {
+ "content": "<|reserved_special_token_33|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128042": {
+ "content": "<|reserved_special_token_34|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128043": {
+ "content": "<|reserved_special_token_35|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128044": {
+ "content": "<|reserved_special_token_36|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128045": {
+ "content": "<|reserved_special_token_37|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128046": {
+ "content": "<|reserved_special_token_38|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128047": {
+ "content": "<|reserved_special_token_39|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128048": {
+ "content": "<|reserved_special_token_40|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128049": {
+ "content": "<|reserved_special_token_41|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128050": {
+ "content": "<|reserved_special_token_42|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128051": {
+ "content": "<|reserved_special_token_43|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128052": {
+ "content": "<|reserved_special_token_44|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128053": {
+ "content": "<|reserved_special_token_45|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128054": {
+ "content": "<|reserved_special_token_46|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128055": {
+ "content": "<|reserved_special_token_47|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128056": {
+ "content": "<|reserved_special_token_48|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128057": {
+ "content": "<|reserved_special_token_49|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128058": {
+ "content": "<|reserved_special_token_50|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128059": {
+ "content": "<|reserved_special_token_51|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128060": {
+ "content": "<|reserved_special_token_52|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128061": {
+ "content": "<|reserved_special_token_53|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128062": {
+ "content": "<|reserved_special_token_54|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128063": {
+ "content": "<|reserved_special_token_55|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128064": {
+ "content": "<|reserved_special_token_56|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128065": {
+ "content": "<|reserved_special_token_57|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128066": {
+ "content": "<|reserved_special_token_58|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128067": {
+ "content": "<|reserved_special_token_59|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128068": {
+ "content": "<|reserved_special_token_60|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128069": {
+ "content": "<|reserved_special_token_61|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128070": {
+ "content": "<|reserved_special_token_62|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128071": {
+ "content": "<|reserved_special_token_63|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128072": {
+ "content": "<|reserved_special_token_64|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128073": {
+ "content": "<|reserved_special_token_65|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128074": {
+ "content": "<|reserved_special_token_66|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128075": {
+ "content": "<|reserved_special_token_67|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128076": {
+ "content": "<|reserved_special_token_68|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128077": {
+ "content": "<|reserved_special_token_69|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128078": {
+ "content": "<|reserved_special_token_70|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128079": {
+ "content": "<|reserved_special_token_71|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128080": {
+ "content": "<|reserved_special_token_72|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128081": {
+ "content": "<|reserved_special_token_73|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128082": {
+ "content": "<|reserved_special_token_74|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128083": {
+ "content": "<|reserved_special_token_75|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128084": {
+ "content": "<|reserved_special_token_76|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128085": {
+ "content": "<|reserved_special_token_77|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128086": {
+ "content": "<|reserved_special_token_78|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128087": {
+ "content": "<|reserved_special_token_79|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128088": {
+ "content": "<|reserved_special_token_80|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128089": {
+ "content": "<|reserved_special_token_81|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128090": {
+ "content": "<|reserved_special_token_82|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128091": {
+ "content": "<|reserved_special_token_83|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128092": {
+ "content": "<|reserved_special_token_84|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128093": {
+ "content": "<|reserved_special_token_85|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128094": {
+ "content": "<|reserved_special_token_86|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128095": {
+ "content": "<|reserved_special_token_87|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128096": {
+ "content": "<|reserved_special_token_88|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128097": {
+ "content": "<|reserved_special_token_89|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128098": {
+ "content": "<|reserved_special_token_90|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128099": {
+ "content": "<|reserved_special_token_91|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128100": {
+ "content": "<|reserved_special_token_92|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128101": {
+ "content": "<|reserved_special_token_93|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128102": {
+ "content": "<|reserved_special_token_94|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128103": {
+ "content": "<|reserved_special_token_95|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128104": {
+ "content": "<|reserved_special_token_96|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128105": {
+ "content": "<|reserved_special_token_97|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128106": {
+ "content": "<|reserved_special_token_98|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128107": {
+ "content": "<|reserved_special_token_99|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128108": {
+ "content": "<|reserved_special_token_100|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128109": {
+ "content": "<|reserved_special_token_101|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128110": {
+ "content": "<|reserved_special_token_102|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128111": {
+ "content": "<|reserved_special_token_103|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128112": {
+ "content": "<|reserved_special_token_104|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128113": {
+ "content": "<|reserved_special_token_105|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128114": {
+ "content": "<|reserved_special_token_106|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128115": {
+ "content": "<|reserved_special_token_107|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128116": {
+ "content": "<|reserved_special_token_108|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128117": {
+ "content": "<|reserved_special_token_109|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128118": {
+ "content": "<|reserved_special_token_110|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128119": {
+ "content": "<|reserved_special_token_111|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128120": {
+ "content": "<|reserved_special_token_112|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128121": {
+ "content": "<|reserved_special_token_113|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128122": {
+ "content": "<|reserved_special_token_114|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128123": {
+ "content": "<|reserved_special_token_115|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128124": {
+ "content": "<|reserved_special_token_116|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128125": {
+ "content": "<|reserved_special_token_117|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128126": {
+ "content": "<|reserved_special_token_118|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128127": {
+ "content": "<|reserved_special_token_119|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128128": {
+ "content": "<|reserved_special_token_120|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128129": {
+ "content": "<|reserved_special_token_121|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128130": {
+ "content": "<|reserved_special_token_122|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128131": {
+ "content": "<|reserved_special_token_123|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128132": {
+ "content": "<|reserved_special_token_124|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128133": {
+ "content": "<|reserved_special_token_125|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128134": {
+ "content": "<|reserved_special_token_126|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128135": {
+ "content": "<|reserved_special_token_127|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128136": {
+ "content": "<|reserved_special_token_128|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128137": {
+ "content": "<|reserved_special_token_129|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128138": {
+ "content": "<|reserved_special_token_130|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128139": {
+ "content": "<|reserved_special_token_131|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128140": {
+ "content": "<|reserved_special_token_132|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128141": {
+ "content": "<|reserved_special_token_133|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128142": {
+ "content": "<|reserved_special_token_134|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128143": {
+ "content": "<|reserved_special_token_135|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128144": {
+ "content": "<|reserved_special_token_136|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128145": {
+ "content": "<|reserved_special_token_137|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128146": {
+ "content": "<|reserved_special_token_138|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128147": {
+ "content": "<|reserved_special_token_139|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128148": {
+ "content": "<|reserved_special_token_140|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128149": {
+ "content": "<|reserved_special_token_141|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128150": {
+ "content": "<|reserved_special_token_142|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128151": {
+ "content": "<|reserved_special_token_143|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128152": {
+ "content": "<|reserved_special_token_144|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128153": {
+ "content": "<|reserved_special_token_145|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128154": {
+ "content": "<|reserved_special_token_146|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128155": {
+ "content": "<|reserved_special_token_147|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128156": {
+ "content": "<|reserved_special_token_148|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128157": {
+ "content": "<|reserved_special_token_149|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128158": {
+ "content": "<|reserved_special_token_150|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128159": {
+ "content": "<|reserved_special_token_151|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128160": {
+ "content": "<|reserved_special_token_152|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128161": {
+ "content": "<|reserved_special_token_153|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128162": {
+ "content": "<|reserved_special_token_154|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128163": {
+ "content": "<|reserved_special_token_155|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128164": {
+ "content": "<|reserved_special_token_156|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128165": {
+ "content": "<|reserved_special_token_157|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128166": {
+ "content": "<|reserved_special_token_158|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128167": {
+ "content": "<|reserved_special_token_159|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128168": {
+ "content": "<|reserved_special_token_160|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128169": {
+ "content": "<|reserved_special_token_161|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128170": {
+ "content": "<|reserved_special_token_162|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128171": {
+ "content": "<|reserved_special_token_163|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128172": {
+ "content": "<|reserved_special_token_164|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128173": {
+ "content": "<|reserved_special_token_165|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128174": {
+ "content": "<|reserved_special_token_166|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128175": {
+ "content": "<|reserved_special_token_167|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128176": {
+ "content": "<|reserved_special_token_168|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128177": {
+ "content": "<|reserved_special_token_169|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128178": {
+ "content": "<|reserved_special_token_170|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128179": {
+ "content": "<|reserved_special_token_171|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128180": {
+ "content": "<|reserved_special_token_172|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128181": {
+ "content": "<|reserved_special_token_173|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128182": {
+ "content": "<|reserved_special_token_174|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128183": {
+ "content": "<|reserved_special_token_175|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128184": {
+ "content": "<|reserved_special_token_176|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128185": {
+ "content": "<|reserved_special_token_177|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128186": {
+ "content": "<|reserved_special_token_178|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128187": {
+ "content": "<|reserved_special_token_179|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128188": {
+ "content": "<|reserved_special_token_180|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128189": {
+ "content": "<|reserved_special_token_181|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128190": {
+ "content": "<|reserved_special_token_182|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128191": {
+ "content": "<|reserved_special_token_183|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128192": {
+ "content": "<|reserved_special_token_184|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128193": {
+ "content": "<|reserved_special_token_185|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128194": {
+ "content": "<|reserved_special_token_186|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128195": {
+ "content": "<|reserved_special_token_187|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128196": {
+ "content": "<|reserved_special_token_188|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128197": {
+ "content": "<|reserved_special_token_189|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128198": {
+ "content": "<|reserved_special_token_190|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128199": {
+ "content": "<|reserved_special_token_191|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128200": {
+ "content": "<|reserved_special_token_192|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128201": {
+ "content": "<|reserved_special_token_193|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128202": {
+ "content": "<|reserved_special_token_194|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128203": {
+ "content": "<|reserved_special_token_195|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128204": {
+ "content": "<|reserved_special_token_196|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128205": {
+ "content": "<|reserved_special_token_197|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128206": {
+ "content": "<|reserved_special_token_198|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128207": {
+ "content": "<|reserved_special_token_199|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128208": {
+ "content": "<|reserved_special_token_200|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128209": {
+ "content": "<|reserved_special_token_201|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128210": {
+ "content": "<|reserved_special_token_202|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128211": {
+ "content": "<|reserved_special_token_203|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128212": {
+ "content": "<|reserved_special_token_204|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128213": {
+ "content": "<|reserved_special_token_205|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128214": {
+ "content": "<|reserved_special_token_206|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128215": {
+ "content": "<|reserved_special_token_207|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128216": {
+ "content": "<|reserved_special_token_208|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128217": {
+ "content": "<|reserved_special_token_209|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128218": {
+ "content": "<|reserved_special_token_210|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128219": {
+ "content": "<|reserved_special_token_211|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128220": {
+ "content": "<|reserved_special_token_212|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128221": {
+ "content": "<|reserved_special_token_213|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128222": {
+ "content": "<|reserved_special_token_214|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128223": {
+ "content": "<|reserved_special_token_215|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128224": {
+ "content": "<|reserved_special_token_216|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128225": {
+ "content": "<|reserved_special_token_217|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128226": {
+ "content": "<|reserved_special_token_218|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128227": {
+ "content": "<|reserved_special_token_219|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128228": {
+ "content": "<|reserved_special_token_220|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128229": {
+ "content": "<|reserved_special_token_221|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128230": {
+ "content": "<|reserved_special_token_222|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128231": {
+ "content": "<|reserved_special_token_223|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128232": {
+ "content": "<|reserved_special_token_224|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128233": {
+ "content": "<|reserved_special_token_225|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128234": {
+ "content": "<|reserved_special_token_226|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128235": {
+ "content": "<|reserved_special_token_227|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128236": {
+ "content": "<|reserved_special_token_228|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128237": {
+ "content": "<|reserved_special_token_229|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128238": {
+ "content": "<|reserved_special_token_230|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128239": {
+ "content": "<|reserved_special_token_231|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128240": {
+ "content": "<|reserved_special_token_232|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128241": {
+ "content": "<|reserved_special_token_233|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128242": {
+ "content": "<|reserved_special_token_234|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128243": {
+ "content": "<|reserved_special_token_235|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128244": {
+ "content": "<|reserved_special_token_236|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128245": {
+ "content": "<|reserved_special_token_237|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128246": {
+ "content": "<|reserved_special_token_238|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128247": {
+ "content": "<|reserved_special_token_239|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128248": {
+ "content": "<|reserved_special_token_240|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128249": {
+ "content": "<|reserved_special_token_241|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128250": {
+ "content": "<|reserved_special_token_242|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128251": {
+ "content": "<|reserved_special_token_243|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128252": {
+ "content": "<|reserved_special_token_244|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128253": {
+ "content": "<|reserved_special_token_245|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128254": {
+ "content": "<|reserved_special_token_246|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128255": {
+ "content": "<|reserved_special_token_247|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<|begin_of_text|>",
+ "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n",
+ "clean_up_tokenization_spaces": true,
+ "eos_token": "<|eot_id|>",
+ "extra_special_tokens": {},
+ "model_input_names": [
+ "input_ids",
+ "attention_mask"
+ ],
+ "model_max_length": 131072,
+ "pad_token": "<|end_of_text|>",
+ "tokenizer_class": "PreTrainedTokenizer"
+}
diff --git a/checkpoint-144/trainer_state.json b/checkpoint-144/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..6a34f5c35752afcf85aa843964bdac9394bd9765
--- /dev/null
+++ b/checkpoint-144/trainer_state.json
@@ -0,0 +1,1041 @@
+{
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 1.9815668202764978,
+ "eval_steps": 500,
+ "global_step": 144,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.013824884792626729,
+ "grad_norm": 34.963134765625,
+ "learning_rate": 5.0000000000000004e-08,
+ "loss": 2.5476,
+ "step": 1
+ },
+ {
+ "epoch": 0.027649769585253458,
+ "grad_norm": 35.32600021362305,
+ "learning_rate": 1.0000000000000001e-07,
+ "loss": 2.6058,
+ "step": 2
+ },
+ {
+ "epoch": 0.041474654377880185,
+ "grad_norm": 34.955448150634766,
+ "learning_rate": 1.5000000000000002e-07,
+ "loss": 2.5871,
+ "step": 3
+ },
+ {
+ "epoch": 0.055299539170506916,
+ "grad_norm": 35.09806442260742,
+ "learning_rate": 2.0000000000000002e-07,
+ "loss": 2.5912,
+ "step": 4
+ },
+ {
+ "epoch": 0.06912442396313365,
+ "grad_norm": 34.88739776611328,
+ "learning_rate": 2.5000000000000004e-07,
+ "loss": 2.592,
+ "step": 5
+ },
+ {
+ "epoch": 0.08294930875576037,
+ "grad_norm": 34.84288024902344,
+ "learning_rate": 3.0000000000000004e-07,
+ "loss": 2.5609,
+ "step": 6
+ },
+ {
+ "epoch": 0.0967741935483871,
+ "grad_norm": 35.0090217590332,
+ "learning_rate": 3.5000000000000004e-07,
+ "loss": 2.5651,
+ "step": 7
+ },
+ {
+ "epoch": 0.11059907834101383,
+ "grad_norm": 35.03983688354492,
+ "learning_rate": 4.0000000000000003e-07,
+ "loss": 2.5437,
+ "step": 8
+ },
+ {
+ "epoch": 0.12442396313364056,
+ "grad_norm": 34.802833557128906,
+ "learning_rate": 4.5000000000000003e-07,
+ "loss": 2.5448,
+ "step": 9
+ },
+ {
+ "epoch": 0.1382488479262673,
+ "grad_norm": 34.5220947265625,
+ "learning_rate": 5.000000000000001e-07,
+ "loss": 2.504,
+ "step": 10
+ },
+ {
+ "epoch": 0.15207373271889402,
+ "grad_norm": 34.401580810546875,
+ "learning_rate": 5.5e-07,
+ "loss": 2.4814,
+ "step": 11
+ },
+ {
+ "epoch": 0.16589861751152074,
+ "grad_norm": 33.76997375488281,
+ "learning_rate": 6.000000000000001e-07,
+ "loss": 2.4282,
+ "step": 12
+ },
+ {
+ "epoch": 0.17972350230414746,
+ "grad_norm": 33.53415298461914,
+ "learning_rate": 6.5e-07,
+ "loss": 2.4216,
+ "step": 13
+ },
+ {
+ "epoch": 0.1935483870967742,
+ "grad_norm": 32.401580810546875,
+ "learning_rate": 7.000000000000001e-07,
+ "loss": 2.3362,
+ "step": 14
+ },
+ {
+ "epoch": 0.2073732718894009,
+ "grad_norm": 33.636661529541016,
+ "learning_rate": 7.5e-07,
+ "loss": 2.2978,
+ "step": 15
+ },
+ {
+ "epoch": 0.22119815668202766,
+ "grad_norm": 31.3782901763916,
+ "learning_rate": 8.000000000000001e-07,
+ "loss": 2.1358,
+ "step": 16
+ },
+ {
+ "epoch": 0.2350230414746544,
+ "grad_norm": 30.72391700744629,
+ "learning_rate": 8.500000000000001e-07,
+ "loss": 2.0652,
+ "step": 17
+ },
+ {
+ "epoch": 0.2488479262672811,
+ "grad_norm": 30.817584991455078,
+ "learning_rate": 9.000000000000001e-07,
+ "loss": 2.0115,
+ "step": 18
+ },
+ {
+ "epoch": 0.2626728110599078,
+ "grad_norm": 29.683996200561523,
+ "learning_rate": 9.500000000000001e-07,
+ "loss": 1.8668,
+ "step": 19
+ },
+ {
+ "epoch": 0.2764976958525346,
+ "grad_norm": 29.506683349609375,
+ "learning_rate": 1.0000000000000002e-06,
+ "loss": 1.7796,
+ "step": 20
+ },
+ {
+ "epoch": 0.2903225806451613,
+ "grad_norm": 27.55340003967285,
+ "learning_rate": 1.0500000000000001e-06,
+ "loss": 1.5656,
+ "step": 21
+ },
+ {
+ "epoch": 0.30414746543778803,
+ "grad_norm": 27.78036880493164,
+ "learning_rate": 1.1e-06,
+ "loss": 1.5112,
+ "step": 22
+ },
+ {
+ "epoch": 0.31797235023041476,
+ "grad_norm": 26.36115264892578,
+ "learning_rate": 1.1500000000000002e-06,
+ "loss": 1.3283,
+ "step": 23
+ },
+ {
+ "epoch": 0.3317972350230415,
+ "grad_norm": 25.388761520385742,
+ "learning_rate": 1.2000000000000002e-06,
+ "loss": 1.137,
+ "step": 24
+ },
+ {
+ "epoch": 0.3456221198156682,
+ "grad_norm": 25.21432876586914,
+ "learning_rate": 1.25e-06,
+ "loss": 0.9867,
+ "step": 25
+ },
+ {
+ "epoch": 0.35944700460829493,
+ "grad_norm": 24.924489974975586,
+ "learning_rate": 1.3e-06,
+ "loss": 0.7122,
+ "step": 26
+ },
+ {
+ "epoch": 0.37327188940092165,
+ "grad_norm": 21.881420135498047,
+ "learning_rate": 1.3500000000000002e-06,
+ "loss": 0.4952,
+ "step": 27
+ },
+ {
+ "epoch": 0.3870967741935484,
+ "grad_norm": 17.67154884338379,
+ "learning_rate": 1.4000000000000001e-06,
+ "loss": 0.3602,
+ "step": 28
+ },
+ {
+ "epoch": 0.4009216589861751,
+ "grad_norm": 11.489490509033203,
+ "learning_rate": 1.45e-06,
+ "loss": 0.2432,
+ "step": 29
+ },
+ {
+ "epoch": 0.4147465437788018,
+ "grad_norm": 7.622438907623291,
+ "learning_rate": 1.5e-06,
+ "loss": 0.189,
+ "step": 30
+ },
+ {
+ "epoch": 0.42857142857142855,
+ "grad_norm": 4.340638637542725,
+ "learning_rate": 1.5500000000000002e-06,
+ "loss": 0.1302,
+ "step": 31
+ },
+ {
+ "epoch": 0.4423963133640553,
+ "grad_norm": 3.079514980316162,
+ "learning_rate": 1.6000000000000001e-06,
+ "loss": 0.1075,
+ "step": 32
+ },
+ {
+ "epoch": 0.45622119815668205,
+ "grad_norm": 2.355943441390991,
+ "learning_rate": 1.6500000000000003e-06,
+ "loss": 0.0998,
+ "step": 33
+ },
+ {
+ "epoch": 0.4700460829493088,
+ "grad_norm": 1.9480725526809692,
+ "learning_rate": 1.7000000000000002e-06,
+ "loss": 0.0926,
+ "step": 34
+ },
+ {
+ "epoch": 0.4838709677419355,
+ "grad_norm": 1.8598166704177856,
+ "learning_rate": 1.75e-06,
+ "loss": 0.0733,
+ "step": 35
+ },
+ {
+ "epoch": 0.4976958525345622,
+ "grad_norm": 0.9892730712890625,
+ "learning_rate": 1.8000000000000001e-06,
+ "loss": 0.0664,
+ "step": 36
+ },
+ {
+ "epoch": 0.511520737327189,
+ "grad_norm": 0.8992418050765991,
+ "learning_rate": 1.85e-06,
+ "loss": 0.0709,
+ "step": 37
+ },
+ {
+ "epoch": 0.5253456221198156,
+ "grad_norm": 0.7340101599693298,
+ "learning_rate": 1.9000000000000002e-06,
+ "loss": 0.0535,
+ "step": 38
+ },
+ {
+ "epoch": 0.5391705069124424,
+ "grad_norm": 0.7032178044319153,
+ "learning_rate": 1.9500000000000004e-06,
+ "loss": 0.0573,
+ "step": 39
+ },
+ {
+ "epoch": 0.5529953917050692,
+ "grad_norm": 0.6449429392814636,
+ "learning_rate": 2.0000000000000003e-06,
+ "loss": 0.0576,
+ "step": 40
+ },
+ {
+ "epoch": 0.5668202764976958,
+ "grad_norm": 0.6358592510223389,
+ "learning_rate": 2.05e-06,
+ "loss": 0.0502,
+ "step": 41
+ },
+ {
+ "epoch": 0.5806451612903226,
+ "grad_norm": 0.572036623954773,
+ "learning_rate": 2.1000000000000002e-06,
+ "loss": 0.0556,
+ "step": 42
+ },
+ {
+ "epoch": 0.5944700460829493,
+ "grad_norm": 0.6538863778114319,
+ "learning_rate": 2.15e-06,
+ "loss": 0.0556,
+ "step": 43
+ },
+ {
+ "epoch": 0.6082949308755761,
+ "grad_norm": 0.3532159626483917,
+ "learning_rate": 2.2e-06,
+ "loss": 0.0452,
+ "step": 44
+ },
+ {
+ "epoch": 0.6221198156682027,
+ "grad_norm": 0.4853012263774872,
+ "learning_rate": 2.25e-06,
+ "loss": 0.0471,
+ "step": 45
+ },
+ {
+ "epoch": 0.6359447004608295,
+ "grad_norm": 0.4761648178100586,
+ "learning_rate": 2.3000000000000004e-06,
+ "loss": 0.0469,
+ "step": 46
+ },
+ {
+ "epoch": 0.6497695852534562,
+ "grad_norm": 0.6094638109207153,
+ "learning_rate": 2.35e-06,
+ "loss": 0.047,
+ "step": 47
+ },
+ {
+ "epoch": 0.663594470046083,
+ "grad_norm": 0.5211306214332581,
+ "learning_rate": 2.4000000000000003e-06,
+ "loss": 0.0402,
+ "step": 48
+ },
+ {
+ "epoch": 0.6774193548387096,
+ "grad_norm": 0.2997778356075287,
+ "learning_rate": 2.4500000000000003e-06,
+ "loss": 0.0425,
+ "step": 49
+ },
+ {
+ "epoch": 0.6912442396313364,
+ "grad_norm": 0.37834689021110535,
+ "learning_rate": 2.5e-06,
+ "loss": 0.0446,
+ "step": 50
+ },
+ {
+ "epoch": 0.7050691244239631,
+ "grad_norm": 0.31011995673179626,
+ "learning_rate": 2.55e-06,
+ "loss": 0.0406,
+ "step": 51
+ },
+ {
+ "epoch": 0.7188940092165899,
+ "grad_norm": 0.3113131523132324,
+ "learning_rate": 2.6e-06,
+ "loss": 0.0368,
+ "step": 52
+ },
+ {
+ "epoch": 0.7327188940092166,
+ "grad_norm": 0.5685846209526062,
+ "learning_rate": 2.6500000000000005e-06,
+ "loss": 0.0389,
+ "step": 53
+ },
+ {
+ "epoch": 0.7465437788018433,
+ "grad_norm": 0.29334983229637146,
+ "learning_rate": 2.7000000000000004e-06,
+ "loss": 0.0423,
+ "step": 54
+ },
+ {
+ "epoch": 0.7603686635944701,
+ "grad_norm": 0.5776861906051636,
+ "learning_rate": 2.7500000000000004e-06,
+ "loss": 0.0399,
+ "step": 55
+ },
+ {
+ "epoch": 0.7741935483870968,
+ "grad_norm": 0.35423165559768677,
+ "learning_rate": 2.8000000000000003e-06,
+ "loss": 0.0357,
+ "step": 56
+ },
+ {
+ "epoch": 0.7880184331797235,
+ "grad_norm": 0.37902742624282837,
+ "learning_rate": 2.85e-06,
+ "loss": 0.0407,
+ "step": 57
+ },
+ {
+ "epoch": 0.8018433179723502,
+ "grad_norm": 0.26948878169059753,
+ "learning_rate": 2.9e-06,
+ "loss": 0.0351,
+ "step": 58
+ },
+ {
+ "epoch": 0.815668202764977,
+ "grad_norm": 0.35688117146492004,
+ "learning_rate": 2.95e-06,
+ "loss": 0.0377,
+ "step": 59
+ },
+ {
+ "epoch": 0.8294930875576036,
+ "grad_norm": 0.5287911891937256,
+ "learning_rate": 3e-06,
+ "loss": 0.0377,
+ "step": 60
+ },
+ {
+ "epoch": 0.8433179723502304,
+ "grad_norm": 0.2950785756111145,
+ "learning_rate": 3.05e-06,
+ "loss": 0.0361,
+ "step": 61
+ },
+ {
+ "epoch": 0.8571428571428571,
+ "grad_norm": 0.2789723575115204,
+ "learning_rate": 3.1000000000000004e-06,
+ "loss": 0.032,
+ "step": 62
+ },
+ {
+ "epoch": 0.8709677419354839,
+ "grad_norm": 0.2802198529243469,
+ "learning_rate": 3.1500000000000003e-06,
+ "loss": 0.0394,
+ "step": 63
+ },
+ {
+ "epoch": 0.8847926267281107,
+ "grad_norm": 0.286981463432312,
+ "learning_rate": 3.2000000000000003e-06,
+ "loss": 0.033,
+ "step": 64
+ },
+ {
+ "epoch": 0.8986175115207373,
+ "grad_norm": 0.37392762303352356,
+ "learning_rate": 3.2500000000000002e-06,
+ "loss": 0.0335,
+ "step": 65
+ },
+ {
+ "epoch": 0.9124423963133641,
+ "grad_norm": 0.25025588274002075,
+ "learning_rate": 3.3000000000000006e-06,
+ "loss": 0.0311,
+ "step": 66
+ },
+ {
+ "epoch": 0.9262672811059908,
+ "grad_norm": 0.4292861521244049,
+ "learning_rate": 3.3500000000000005e-06,
+ "loss": 0.0362,
+ "step": 67
+ },
+ {
+ "epoch": 0.9400921658986175,
+ "grad_norm": 0.4717651307582855,
+ "learning_rate": 3.4000000000000005e-06,
+ "loss": 0.0303,
+ "step": 68
+ },
+ {
+ "epoch": 0.9539170506912442,
+ "grad_norm": 0.49291253089904785,
+ "learning_rate": 3.45e-06,
+ "loss": 0.0352,
+ "step": 69
+ },
+ {
+ "epoch": 0.967741935483871,
+ "grad_norm": 0.3729935586452484,
+ "learning_rate": 3.5e-06,
+ "loss": 0.0297,
+ "step": 70
+ },
+ {
+ "epoch": 0.9815668202764977,
+ "grad_norm": 0.27150583267211914,
+ "learning_rate": 3.5500000000000003e-06,
+ "loss": 0.0326,
+ "step": 71
+ },
+ {
+ "epoch": 0.9953917050691244,
+ "grad_norm": 0.34516096115112305,
+ "learning_rate": 3.6000000000000003e-06,
+ "loss": 0.0336,
+ "step": 72
+ },
+ {
+ "epoch": 1.0,
+ "grad_norm": 0.34516096115112305,
+ "learning_rate": 3.65e-06,
+ "loss": 0.0274,
+ "step": 73
+ },
+ {
+ "epoch": 1.0138248847926268,
+ "grad_norm": 0.6282734870910645,
+ "learning_rate": 3.7e-06,
+ "loss": 0.0289,
+ "step": 74
+ },
+ {
+ "epoch": 1.0276497695852536,
+ "grad_norm": 0.2935558557510376,
+ "learning_rate": 3.7500000000000005e-06,
+ "loss": 0.0308,
+ "step": 75
+ },
+ {
+ "epoch": 1.0414746543778801,
+ "grad_norm": 0.3166769742965698,
+ "learning_rate": 3.8000000000000005e-06,
+ "loss": 0.0277,
+ "step": 76
+ },
+ {
+ "epoch": 1.055299539170507,
+ "grad_norm": 0.38190239667892456,
+ "learning_rate": 3.85e-06,
+ "loss": 0.0338,
+ "step": 77
+ },
+ {
+ "epoch": 1.0691244239631337,
+ "grad_norm": 0.2779421806335449,
+ "learning_rate": 3.900000000000001e-06,
+ "loss": 0.03,
+ "step": 78
+ },
+ {
+ "epoch": 1.0829493087557605,
+ "grad_norm": 0.4055996537208557,
+ "learning_rate": 3.95e-06,
+ "loss": 0.0295,
+ "step": 79
+ },
+ {
+ "epoch": 1.096774193548387,
+ "grad_norm": 0.2987312972545624,
+ "learning_rate": 4.000000000000001e-06,
+ "loss": 0.028,
+ "step": 80
+ },
+ {
+ "epoch": 1.1105990783410138,
+ "grad_norm": 0.2674776017665863,
+ "learning_rate": 4.05e-06,
+ "loss": 0.0243,
+ "step": 81
+ },
+ {
+ "epoch": 1.1244239631336406,
+ "grad_norm": 0.29042816162109375,
+ "learning_rate": 4.1e-06,
+ "loss": 0.0318,
+ "step": 82
+ },
+ {
+ "epoch": 1.1382488479262673,
+ "grad_norm": 0.2904883027076721,
+ "learning_rate": 4.15e-06,
+ "loss": 0.0257,
+ "step": 83
+ },
+ {
+ "epoch": 1.1520737327188941,
+ "grad_norm": 0.30603015422821045,
+ "learning_rate": 4.2000000000000004e-06,
+ "loss": 0.0284,
+ "step": 84
+ },
+ {
+ "epoch": 1.1658986175115207,
+ "grad_norm": 0.23131045699119568,
+ "learning_rate": 4.25e-06,
+ "loss": 0.0285,
+ "step": 85
+ },
+ {
+ "epoch": 1.1797235023041475,
+ "grad_norm": 0.26788002252578735,
+ "learning_rate": 4.3e-06,
+ "loss": 0.0269,
+ "step": 86
+ },
+ {
+ "epoch": 1.1935483870967742,
+ "grad_norm": 0.2639651894569397,
+ "learning_rate": 4.350000000000001e-06,
+ "loss": 0.0289,
+ "step": 87
+ },
+ {
+ "epoch": 1.2073732718894008,
+ "grad_norm": 0.25068584084510803,
+ "learning_rate": 4.4e-06,
+ "loss": 0.0275,
+ "step": 88
+ },
+ {
+ "epoch": 1.2211981566820276,
+ "grad_norm": 0.25494542717933655,
+ "learning_rate": 4.450000000000001e-06,
+ "loss": 0.0275,
+ "step": 89
+ },
+ {
+ "epoch": 1.2350230414746544,
+ "grad_norm": 0.31125035881996155,
+ "learning_rate": 4.5e-06,
+ "loss": 0.0251,
+ "step": 90
+ },
+ {
+ "epoch": 1.2488479262672811,
+ "grad_norm": 0.2691773474216461,
+ "learning_rate": 4.5500000000000005e-06,
+ "loss": 0.0267,
+ "step": 91
+ },
+ {
+ "epoch": 1.262672811059908,
+ "grad_norm": 0.20079147815704346,
+ "learning_rate": 4.600000000000001e-06,
+ "loss": 0.0263,
+ "step": 92
+ },
+ {
+ "epoch": 1.2764976958525347,
+ "grad_norm": 0.28027331829071045,
+ "learning_rate": 4.65e-06,
+ "loss": 0.0227,
+ "step": 93
+ },
+ {
+ "epoch": 1.2903225806451613,
+ "grad_norm": 0.40053099393844604,
+ "learning_rate": 4.7e-06,
+ "loss": 0.0246,
+ "step": 94
+ },
+ {
+ "epoch": 1.304147465437788,
+ "grad_norm": 0.33066362142562866,
+ "learning_rate": 4.75e-06,
+ "loss": 0.0221,
+ "step": 95
+ },
+ {
+ "epoch": 1.3179723502304148,
+ "grad_norm": 0.2531339228153229,
+ "learning_rate": 4.800000000000001e-06,
+ "loss": 0.0216,
+ "step": 96
+ },
+ {
+ "epoch": 1.3317972350230414,
+ "grad_norm": 0.37544378638267517,
+ "learning_rate": 4.85e-06,
+ "loss": 0.0247,
+ "step": 97
+ },
+ {
+ "epoch": 1.3456221198156681,
+ "grad_norm": 0.34273672103881836,
+ "learning_rate": 4.9000000000000005e-06,
+ "loss": 0.0217,
+ "step": 98
+ },
+ {
+ "epoch": 1.359447004608295,
+ "grad_norm": 0.2338661253452301,
+ "learning_rate": 4.95e-06,
+ "loss": 0.0237,
+ "step": 99
+ },
+ {
+ "epoch": 1.3732718894009217,
+ "grad_norm": 0.30151981115341187,
+ "learning_rate": 5e-06,
+ "loss": 0.0248,
+ "step": 100
+ },
+ {
+ "epoch": 1.3870967741935485,
+ "grad_norm": 0.3205336630344391,
+ "learning_rate": 4.999888074163108e-06,
+ "loss": 0.0232,
+ "step": 101
+ },
+ {
+ "epoch": 1.400921658986175,
+ "grad_norm": 0.2705315351486206,
+ "learning_rate": 4.999552306674345e-06,
+ "loss": 0.0245,
+ "step": 102
+ },
+ {
+ "epoch": 1.4147465437788018,
+ "grad_norm": 0.2564137578010559,
+ "learning_rate": 4.998992727598557e-06,
+ "loss": 0.0274,
+ "step": 103
+ },
+ {
+ "epoch": 1.4285714285714286,
+ "grad_norm": 0.1967611312866211,
+ "learning_rate": 4.998209387040829e-06,
+ "loss": 0.0173,
+ "step": 104
+ },
+ {
+ "epoch": 1.4423963133640554,
+ "grad_norm": 0.2568240761756897,
+ "learning_rate": 4.9972023551419995e-06,
+ "loss": 0.0223,
+ "step": 105
+ },
+ {
+ "epoch": 1.456221198156682,
+ "grad_norm": 0.2236352413892746,
+ "learning_rate": 4.995971722072379e-06,
+ "loss": 0.0202,
+ "step": 106
+ },
+ {
+ "epoch": 1.4700460829493087,
+ "grad_norm": 0.3389627933502197,
+ "learning_rate": 4.9945175980236745e-06,
+ "loss": 0.0214,
+ "step": 107
+ },
+ {
+ "epoch": 1.4838709677419355,
+ "grad_norm": 0.31428012251853943,
+ "learning_rate": 4.992840113199131e-06,
+ "loss": 0.0188,
+ "step": 108
+ },
+ {
+ "epoch": 1.4976958525345623,
+ "grad_norm": 0.41508516669273376,
+ "learning_rate": 4.990939417801859e-06,
+ "loss": 0.0213,
+ "step": 109
+ },
+ {
+ "epoch": 1.511520737327189,
+ "grad_norm": 0.19615545868873596,
+ "learning_rate": 4.988815682021398e-06,
+ "loss": 0.0191,
+ "step": 110
+ },
+ {
+ "epoch": 1.5253456221198156,
+ "grad_norm": 0.2059931755065918,
+ "learning_rate": 4.986469096018472e-06,
+ "loss": 0.0208,
+ "step": 111
+ },
+ {
+ "epoch": 1.5391705069124424,
+ "grad_norm": 0.26946336030960083,
+ "learning_rate": 4.983899869907963e-06,
+ "loss": 0.0192,
+ "step": 112
+ },
+ {
+ "epoch": 1.5529953917050692,
+ "grad_norm": 0.3227538466453552,
+ "learning_rate": 4.981108233740096e-06,
+ "loss": 0.0169,
+ "step": 113
+ },
+ {
+ "epoch": 1.5668202764976957,
+ "grad_norm": 0.2811918258666992,
+ "learning_rate": 4.978094437479843e-06,
+ "loss": 0.0151,
+ "step": 114
+ },
+ {
+ "epoch": 1.5806451612903225,
+ "grad_norm": 0.32980477809906006,
+ "learning_rate": 4.97485875098454e-06,
+ "loss": 0.0182,
+ "step": 115
+ },
+ {
+ "epoch": 1.5944700460829493,
+ "grad_norm": 0.2759259045124054,
+ "learning_rate": 4.971401463979722e-06,
+ "loss": 0.0192,
+ "step": 116
+ },
+ {
+ "epoch": 1.608294930875576,
+ "grad_norm": 0.2572178840637207,
+ "learning_rate": 4.967722886033181e-06,
+ "loss": 0.0198,
+ "step": 117
+ },
+ {
+ "epoch": 1.6221198156682028,
+ "grad_norm": 0.3238658905029297,
+ "learning_rate": 4.963823346527249e-06,
+ "loss": 0.0186,
+ "step": 118
+ },
+ {
+ "epoch": 1.6359447004608296,
+ "grad_norm": 0.3834918737411499,
+ "learning_rate": 4.959703194629304e-06,
+ "loss": 0.0188,
+ "step": 119
+ },
+ {
+ "epoch": 1.6497695852534562,
+ "grad_norm": 0.23881244659423828,
+ "learning_rate": 4.955362799260507e-06,
+ "loss": 0.0182,
+ "step": 120
+ },
+ {
+ "epoch": 1.663594470046083,
+ "grad_norm": 0.1885918825864792,
+ "learning_rate": 4.950802549062764e-06,
+ "loss": 0.0183,
+ "step": 121
+ },
+ {
+ "epoch": 1.6774193548387095,
+ "grad_norm": 0.34959614276885986,
+ "learning_rate": 4.946022852363932e-06,
+ "loss": 0.0173,
+ "step": 122
+ },
+ {
+ "epoch": 1.6912442396313363,
+ "grad_norm": 0.22990310192108154,
+ "learning_rate": 4.9410241371412525e-06,
+ "loss": 0.0135,
+ "step": 123
+ },
+ {
+ "epoch": 1.705069124423963,
+ "grad_norm": 0.2790350615978241,
+ "learning_rate": 4.935806850983034e-06,
+ "loss": 0.0159,
+ "step": 124
+ },
+ {
+ "epoch": 1.7188940092165899,
+ "grad_norm": 0.3218020796775818,
+ "learning_rate": 4.9303714610485705e-06,
+ "loss": 0.0176,
+ "step": 125
+ },
+ {
+ "epoch": 1.7327188940092166,
+ "grad_norm": 0.2294609695672989,
+ "learning_rate": 4.924718454026318e-06,
+ "loss": 0.0149,
+ "step": 126
+ },
+ {
+ "epoch": 1.7465437788018434,
+ "grad_norm": 0.3427927494049072,
+ "learning_rate": 4.918848336090309e-06,
+ "loss": 0.0165,
+ "step": 127
+ },
+ {
+ "epoch": 1.7603686635944702,
+ "grad_norm": 0.22731825709342957,
+ "learning_rate": 4.912761632854834e-06,
+ "loss": 0.0145,
+ "step": 128
+ },
+ {
+ "epoch": 1.7741935483870968,
+ "grad_norm": 0.35364386439323425,
+ "learning_rate": 4.906458889327375e-06,
+ "loss": 0.0161,
+ "step": 129
+ },
+ {
+ "epoch": 1.7880184331797235,
+ "grad_norm": 0.29476454854011536,
+ "learning_rate": 4.899940669859807e-06,
+ "loss": 0.0154,
+ "step": 130
+ },
+ {
+ "epoch": 1.80184331797235,
+ "grad_norm": 0.28667864203453064,
+ "learning_rate": 4.893207558097867e-06,
+ "loss": 0.0143,
+ "step": 131
+ },
+ {
+ "epoch": 1.8156682027649769,
+ "grad_norm": 0.2731999158859253,
+ "learning_rate": 4.8862601569288885e-06,
+ "loss": 0.0141,
+ "step": 132
+ },
+ {
+ "epoch": 1.8294930875576036,
+ "grad_norm": 0.2670470178127289,
+ "learning_rate": 4.879099088427824e-06,
+ "loss": 0.0131,
+ "step": 133
+ },
+ {
+ "epoch": 1.8433179723502304,
+ "grad_norm": 0.23313525319099426,
+ "learning_rate": 4.871724993801541e-06,
+ "loss": 0.012,
+ "step": 134
+ },
+ {
+ "epoch": 1.8571428571428572,
+ "grad_norm": 0.2192607820034027,
+ "learning_rate": 4.864138533331411e-06,
+ "loss": 0.0125,
+ "step": 135
+ },
+ {
+ "epoch": 1.870967741935484,
+ "grad_norm": 0.26603585481643677,
+ "learning_rate": 4.8563403863141825e-06,
+ "loss": 0.0121,
+ "step": 136
+ },
+ {
+ "epoch": 1.8847926267281108,
+ "grad_norm": 0.32500001788139343,
+ "learning_rate": 4.84833125100116e-06,
+ "loss": 0.0116,
+ "step": 137
+ },
+ {
+ "epoch": 1.8986175115207373,
+ "grad_norm": 0.24893291294574738,
+ "learning_rate": 4.840111844535682e-06,
+ "loss": 0.0119,
+ "step": 138
+ },
+ {
+ "epoch": 1.912442396313364,
+ "grad_norm": 0.17670764029026031,
+ "learning_rate": 4.8316829028889076e-06,
+ "loss": 0.0096,
+ "step": 139
+ },
+ {
+ "epoch": 1.9262672811059907,
+ "grad_norm": 0.16747575998306274,
+ "learning_rate": 4.823045180793914e-06,
+ "loss": 0.0113,
+ "step": 140
+ },
+ {
+ "epoch": 1.9400921658986174,
+ "grad_norm": 0.19587458670139313,
+ "learning_rate": 4.8141994516781196e-06,
+ "loss": 0.0111,
+ "step": 141
+ },
+ {
+ "epoch": 1.9539170506912442,
+ "grad_norm": 0.237543985247612,
+ "learning_rate": 4.805146507594034e-06,
+ "loss": 0.0088,
+ "step": 142
+ },
+ {
+ "epoch": 1.967741935483871,
+ "grad_norm": 0.22710399329662323,
+ "learning_rate": 4.7958871591483305e-06,
+ "loss": 0.0085,
+ "step": 143
+ },
+ {
+ "epoch": 1.9815668202764978,
+ "grad_norm": 0.2946629822254181,
+ "learning_rate": 4.786422235429269e-06,
+ "loss": 0.0122,
+ "step": 144
+ }
+ ],
+ "logging_steps": 1,
+ "max_steps": 432,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 6,
+ "save_steps": 72,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 9.220896013978436e+18,
+ "train_batch_size": 4,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/checkpoint-144/training_args.bin b/checkpoint-144/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..48c70fd554062e31c1333fa196fcbd6a4f178c6c
--- /dev/null
+++ b/checkpoint-144/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d1d334ff15891d240486e54641859af8de96cab43a64ef7bf9dc417387365ae5
+size 7928
diff --git a/checkpoint-144/zero_to_fp32.py b/checkpoint-144/zero_to_fp32.py
new file mode 100644
index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8
--- /dev/null
+++ b/checkpoint-144/zero_to_fp32.py
@@ -0,0 +1,604 @@
+#!/usr/bin/env python
+
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example: python zero_to_fp32.py . pytorch_model.bin
+
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+from collections import OrderedDict
+from dataclasses import dataclass
+
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+ FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+ FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+
+
+@dataclass
+class zero_model_state:
+ buffers: dict()
+ param_shapes: dict()
+ shared_params: list
+ ds_version: int
+ frozen_param_shapes: dict()
+ frozen_param_fragments: dict()
+
+
+debug = 0
+
+# load to cpu
+device = torch.device('cpu')
+
+
+def atoi(text):
+ return int(text) if text.isdigit() else text
+
+
+def natural_keys(text):
+ '''
+ alist.sort(key=natural_keys) sorts in human order
+ http://nedbatchelder.com/blog/200712/human_sorting.html
+ (See Toothy's implementation in the comments)
+ '''
+ return [atoi(c) for c in re.split(r'(\d+)', text)]
+
+
+def get_model_state_file(checkpoint_dir, zero_stage):
+ if not os.path.isdir(checkpoint_dir):
+ raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+
+ # there should be only one file
+ if zero_stage <= 2:
+ file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+ elif zero_stage == 3:
+ file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+
+ if not os.path.exists(file):
+ raise FileNotFoundError(f"can't find model states file at '{file}'")
+
+ return file
+
+
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+ # XXX: need to test that this simple glob rule works for multi-node setup too
+ ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
+
+ if len(ckpt_files) == 0:
+ raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+
+ return ckpt_files
+
+
+def get_optim_files(checkpoint_dir):
+ return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
+
+
+def get_model_state_files(checkpoint_dir):
+ return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
+
+
+def parse_model_states(files):
+ zero_model_states = []
+ for file in files:
+ state_dict = torch.load(file, map_location=device)
+
+ if BUFFER_NAMES not in state_dict:
+ raise ValueError(f"{file} is not a model state checkpoint")
+ buffer_names = state_dict[BUFFER_NAMES]
+ if debug:
+ print("Found buffers:", buffer_names)
+
+ # recover just the buffers while restoring them to fp32 if they were saved in fp16
+ buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+ param_shapes = state_dict[PARAM_SHAPES]
+
+ # collect parameters that are included in param_shapes
+ param_names = []
+ for s in param_shapes:
+ for name in s.keys():
+ param_names.append(name)
+
+ # update with frozen parameters
+ frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+ if frozen_param_shapes is not None:
+ if debug:
+ print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+ param_names += list(frozen_param_shapes.keys())
+
+ # handle shared params
+ shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+
+ ds_version = state_dict.get(DS_VERSION, None)
+
+ frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+
+ z_model_state = zero_model_state(buffers=buffers,
+ param_shapes=param_shapes,
+ shared_params=shared_params,
+ ds_version=ds_version,
+ frozen_param_shapes=frozen_param_shapes,
+ frozen_param_fragments=frozen_param_fragments)
+ zero_model_states.append(z_model_state)
+
+ return zero_model_states
+
+
+def parse_optim_states(files, ds_checkpoint_dir):
+
+ total_files = len(files)
+ state_dicts = []
+ for f in files:
+ state_dict = torch.load(f, map_location=device)
+ # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
+ # and also handle the case where it was already removed by another helper script
+ state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
+ state_dicts.append(state_dict)
+
+ if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]:
+ raise ValueError(f"{files[0]} is not a zero checkpoint")
+ zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
+ world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
+
+ # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+ # parameters can be different from data parallelism for non-expert parameters. So we can just
+ # use the max of the partition_count to get the dp world_size.
+
+ if type(world_size) is list:
+ world_size = max(world_size)
+
+ if world_size != total_files:
+ raise ValueError(
+ f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+ "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+ )
+
+ # the groups are named differently in each stage
+ if zero_stage <= 2:
+ fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+ elif zero_stage == 3:
+ fp32_groups_key = FP32_FLAT_GROUPS
+ else:
+ raise ValueError(f"unknown zero stage {zero_stage}")
+
+ if zero_stage <= 2:
+ fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
+ elif zero_stage == 3:
+ # if there is more than one param group, there will be multiple flattened tensors - one
+ # flattened tensor per group - for simplicity merge them into a single tensor
+ #
+ # XXX: could make the script more memory efficient for when there are multiple groups - it
+ # will require matching the sub-lists of param_shapes for each param group flattened tensor
+
+ fp32_flat_groups = [
+ torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts))
+ ]
+
+ return zero_stage, world_size, fp32_flat_groups
+
+
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+ """
+ Returns fp32 state_dict reconstructed from ds checkpoint
+
+ Args:
+ - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+
+ """
+ print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+
+ optim_files = get_optim_files(ds_checkpoint_dir)
+ zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
+ print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+
+ model_files = get_model_state_files(ds_checkpoint_dir)
+
+ zero_model_states = parse_model_states(model_files)
+ print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+
+ if zero_stage <= 2:
+ return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters)
+ elif zero_stage == 3:
+ return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters)
+
+
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+ if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+ return
+
+ frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+ frozen_param_fragments = zero_model_states[0].frozen_param_fragments
+
+ if debug:
+ num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+ print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+ wanted_params = len(frozen_param_shapes)
+ wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+ avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+ print(f'Frozen params: Have {avail_numel} numels to process.')
+ print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+ total_params = 0
+ total_numel = 0
+ for name, shape in frozen_param_shapes.items():
+ total_params += 1
+ unpartitioned_numel = shape.numel()
+ total_numel += unpartitioned_numel
+
+ state_dict[name] = frozen_param_fragments[name]
+
+ if debug:
+ print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+
+ print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _has_callable(obj, fn):
+ attr = getattr(obj, fn, None)
+ return callable(attr)
+
+
def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
    """Reconstruct full fp32 trainable params from ZeRO-2 partitions into ``state_dict``.

    In ZeRO-2 each rank holds a contiguous slice of every param group's
    flattened fp32 weights.  Concatenating the rank slices per group restores
    the full flat vector, which is then cut back into individually shaped
    tensors using the shapes recorded (on rank 0) at save time.

    Args:
        state_dict: mapping updated in place with the reconstructed tensors.
        world_size: number of ranks the checkpoint was saved from.
        fp32_flat_groups: per-rank lists of flat fp32 partitions, one entry
            per optimizer param group.
        zero_model_states: parsed per-rank model states; param shapes are
            taken from rank 0.

    Raises:
        ValueError: if the consumed element count falls outside the range
            the alignment padding allows (indicates a corrupt checkpoint).
    """
    param_shapes = zero_model_states[0].param_shapes

    # Reconstruction protocol:
    #
    # XXX: document this

    # `debug` is a module-level flag (set from the CLI in __main__).
    if debug:
        for i in range(world_size):
            for j in range(len(fp32_flat_groups[0])):
                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")

    # XXX: memory usage doubles here (zero2)
    num_param_groups = len(fp32_flat_groups[0])
    merged_single_partition_of_fp32_groups = []
    # Concatenate every rank's slice of group i into one full flat vector.
    for i in range(num_param_groups):
        merged_partitions = [sd[i] for sd in fp32_flat_groups]
        full_single_fp32_vector = torch.cat(merged_partitions, 0)
        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
    avail_numel = sum(
        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])

    if debug:
        wanted_params = sum([len(shapes) for shapes in param_shapes])
        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
        # not asserting if there is a mismatch due to possible padding
        print(f"Have {avail_numel} numels to process.")
        print(f"Need {wanted_numel} numels in {wanted_params} params.")

    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    total_numel = 0
    total_params = 0
    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
        offset = 0
        avail_numel = full_single_fp32_vector.numel()
        for name, shape in shapes.items():

            # `shape` may be a torch.Size (has .numel) or a plain tuple/list.
            unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
            total_numel += unpartitioned_numel
            total_params += 1

            if debug:
                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
            # Slice this param's elements off the flat vector and restore its shape.
            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
            offset += unpartitioned_numel

        # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
        # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
        # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
        # live optimizer object, so we are checking that the numbers are within the right range
        align_to = 2 * world_size

        def zero2_align(x):
            # Round x up to the nearest multiple of align_to.
            return align_to * math.ceil(x / align_to)

        if debug:
            print(f"original offset={offset}, avail_numel={avail_numel}")

        offset = zero2_align(offset)
        avail_numel = zero2_align(avail_numel)

        if debug:
            print(f"aligned offset={offset}, avail_numel={avail_numel}")

        # Sanity check
        if offset != avail_numel:
            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")

    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+
+
def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
                                               exclude_frozen_parameters):
    """Build a consolidated fp32 ``state_dict`` from parsed ZeRO-2 checkpoint state.

    Merges, in order: the model buffers, optionally the frozen parameters,
    and the trainable parameters; finally re-links shared parameters so that
    aliased names point at the same reconstructed tensor.
    """
    rank0_state = zero_model_states[0]
    state_dict = OrderedDict()

    # buffers
    buffers = rank0_state.buffers
    state_dict.update(buffers)
    if debug:
        print(f"added {len(buffers)} buffers")

    if not exclude_frozen_parameters:
        _zero2_merge_frozen_params(state_dict, zero_model_states)

    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)

    # recover shared parameters
    for alias_name, source_name in rank0_state.shared_params:
        if source_name in state_dict:
            state_dict[alias_name] = state_dict[source_name]

    return state_dict
+
+
def zero3_partitioned_param_info(unpartitioned_numel, world_size):
    """Return ``(partitioned_numel, padding_numel)`` for one ZeRO-3 param.

    ZeRO-3 pads each flattened param so it divides evenly across ranks:
    ``partitioned_numel`` is the per-rank slice size and ``padding_numel``
    the number of trailing filler elements on the last rank.

    Uses exact integer ceiling division (``-(-a // b)``) instead of
    ``math.ceil(a / b)``, whose intermediate float division can lose
    precision for numels beyond 2**53.
    """
    remainder = unpartitioned_numel % world_size
    padding_numel = (world_size - remainder) if remainder else 0
    # -(-a // b) computes ceil(a / b) without going through floats.
    partitioned_numel = -(-unpartitioned_numel // world_size)
    return partitioned_numel, padding_numel
+
+
def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
    """Reconstruct full fp32 frozen params from ZeRO-3 fragments into ``state_dict``.

    Each rank stores a fragment of every frozen param; the fragments are
    concatenated across ranks, trailing alignment padding is trimmed, and
    the original shape is restored.
    """
    # Nothing to do when the checkpoint recorded no frozen params.
    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
        return

    # `debug` is a module-level flag (set from the CLI in __main__).
    if debug:
        for i in range(world_size):
            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')

    # Shapes are taken from rank 0's model state.
    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
    wanted_params = len(frozen_param_shapes)
    wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
    # Every rank holds an equally sized fragment, hence the * world_size.
    avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
    print(f'Frozen params: Have {avail_numel} numels to process.')
    print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')

    total_params = 0
    total_numel = 0
    for name, shape in zero_model_states[0].frozen_param_shapes.items():
        total_params += 1
        unpartitioned_numel = shape.numel()
        total_numel += unpartitioned_numel

        # Concatenate this param's fragment from every rank, drop the trailing
        # padding, and restore the original shape.
        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
        state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)

        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)

        if debug:
            print(
                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
            )

    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
    """Reconstruct full fp32 trainable params from ZeRO-3 partitions into ``state_dict``.

    In ZeRO-3 every rank holds one flat fp32 partition covering a
    1/world_size slice of *each* param (plus per-param padding).  The rank
    partitions are zipped back together at each param's boundary, padding is
    dropped, and the original shape is restored.

    Args:
        state_dict: mapping updated in place with the reconstructed tensors.
        world_size: number of ranks the checkpoint was saved from.
        fp32_flat_groups: one flat fp32 tensor per rank.
        zero_model_states: parsed per-rank model states; param shapes are
            taken from rank 0.

    Raises:
        ValueError: if the consumed element count does not match what the
            flat partitions provide (indicates a corrupt checkpoint).
    """
    param_shapes = zero_model_states[0].param_shapes
    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
    # param, re-consolidating each param, while dealing with padding if any

    # merge list of dicts, preserving order
    param_shapes = {k: v for d in param_shapes for k, v in d.items()}

    # `debug` is a module-level flag (set from the CLI in __main__).
    if debug:
        for i in range(world_size):
            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")

    wanted_params = len(param_shapes)
    wanted_numel = sum(shape.numel() for shape in param_shapes.values())
    # not asserting if there is a mismatch due to possible padding
    avail_numel = fp32_flat_groups[0].numel() * world_size
    print(f"Trainable params: Have {avail_numel} numels to process.")
    print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")

    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    offset = 0
    total_numel = 0
    total_params = 0
    for name, shape in param_shapes.items():

        unpartitioned_numel = shape.numel()
        total_numel += unpartitioned_numel
        total_params += 1

        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)

        if debug:
            print(
                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
            )

        # XXX: memory usage doubles here
        # Pull this param's slice out of every rank's flat partition, stitch
        # the slices together, trim the padding, and restore the shape.
        state_dict[name] = torch.cat(
            tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)),
            0).narrow(0, 0, unpartitioned_numel).view(shape)
        offset += partitioned_numel

    # `offset` counted per-rank elements; scale to the total consumed count.
    offset *= world_size

    # Sanity check
    if offset != avail_numel:
        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")

    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+
+
def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
                                               exclude_frozen_parameters):
    """Build a consolidated fp32 ``state_dict`` from parsed ZeRO-3 checkpoint state.

    Merges, in order: the model buffers, optionally the frozen parameters,
    and the trainable parameters; finally re-links shared parameters so that
    aliased names point at the same reconstructed tensor.
    """
    state_dict = OrderedDict()

    # buffers
    buffers = zero_model_states[0].buffers
    state_dict.update(buffers)
    if debug:
        print(f"added {len(buffers)} buffers")

    if not exclude_frozen_parameters:
        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)

    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)

    # recover shared parameters
    for alias_name, source_name in zero_model_states[0].shared_params:
        if source_name in state_dict:
            state_dict[alias_name] = state_dict[source_name]

    return state_dict
+
+
def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False):
    """Convert a ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``.

    The returned dict can be consumed by ``load_state_dict()`` and used for
    training without DeepSpeed or shared with others, for example via a
    model hub.

    Args:
        checkpoint_dir: path to the desired checkpoint folder.
        tag: checkpoint tag used as a unique identifier for the checkpoint
            (e.g. ``global_step14``); when not provided, it is read from the
            ``latest`` file inside ``checkpoint_dir``.
        exclude_frozen_parameters: when True, frozen parameters are skipped.

    Returns:
        A pytorch ``state_dict`` (already on CPU).

    Raises:
        ValueError: if ``tag`` is not given and no ``latest`` file exists.
        FileNotFoundError: if the resolved tag folder does not exist.

    Note: this approach may not work if your application doesn't have
    sufficient free CPU memory; in that case use the offline approach via
    the ``zero_to_fp32.py`` script that is saved with the checkpoint.

    A typical usage might be ::

        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
        # do the training and checkpoint saving
        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
        model = model.cpu() # move to cpu
        model.load_state_dict(state_dict)
        # submit to model hub or save the model to share with others

    In this example the ``model`` will no longer be usable in the deepspeed
    context of the same application — you will need to re-initialize the
    deepspeed engine, since ``model.load_state_dict(state_dict)`` removes
    all the deepspeed magic from it.

    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
    """
    if tag is None:
        # Resolve the tag from the 'latest' marker file DeepSpeed writes.
        latest_path = os.path.join(checkpoint_dir, 'latest')
        if not os.path.isfile(latest_path):
            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
        with open(latest_path, 'r') as fd:
            tag = fd.read().strip()

    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
    if not os.path.isdir(ds_checkpoint_dir):
        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")

    return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+
+
def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False):
    """Convert a ZeRO 2 or 3 checkpoint into a single fp32 ``state_dict`` file.

    The resulting file can be loaded with ``torch.load(file)`` +
    ``load_state_dict()`` and used for training without DeepSpeed.

    Args:
        checkpoint_dir: checkpoint folder that contains the tag-folder
            (e.g. ``global_step14``).
        output_file: path of the pytorch fp32 state_dict output file
            (e.g. ``path/pytorch_model.bin``).
        tag: checkpoint tag; read from the ``latest`` file in the checkpoint
            folder when not given.
        exclude_frozen_parameters: when True, frozen parameters are skipped.
    """
    consolidated = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters)
    print(f"Saving fp32 state dict to {output_file}")
    torch.save(consolidated, output_file)
+
+
def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
    """Consolidate a ZeRO 2 or 3 checkpoint to fp32 and load it into ``model``.

    Steps: (1) put the provided model on CPU, (2) build the consolidated
    fp32 ``state_dict`` from the checkpoint, (3) load it into the model.

    Args:
        model: the model object to update.
        checkpoint_dir: checkpoint folder that contains the tag-folder
            (e.g. ``global_step14``).
        tag: checkpoint tag; read from the ``latest`` file in the checkpoint
            folder when not given.

    Returns:
        The modified ``model`` (on CPU).

    Make sure you have plenty of CPU memory available before you call this
    function; if you don't, use the ``zero_to_fp32.py`` utility placed for
    you in the checkpoint folder.

    A typical usage might be ::

        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
        # submit to model hub or save the model to share with others

    Note: once this has run, ``model`` is no longer usable in the deepspeed
    context of the same application — you will need to re-initialize the
    deepspeed engine, since ``model.load_state_dict(state_dict)`` removes
    all the deepspeed magic from it.
    """
    logger.info(f"Extracting fp32 weights")
    fp32_state = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)

    logger.info(f"Overwriting model with fp32 weights")
    model = model.cpu()
    model.load_state_dict(fp32_state, strict=False)

    return model
+
+
if __name__ == "__main__":
    # CLI entry point: consolidate a ZeRO checkpoint into one fp32 file.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "checkpoint_dir",
        type=str,
        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
    parser.add_argument(
        "output_file",
        type=str,
        help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)")
    parser.add_argument(
        "-t",
        "--tag",
        type=str,
        default=None,
        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
    args = parser.parse_args()

    # Module-level `debug` flag consumed by the merge/getter functions above.
    debug = args.debug

    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
                                               args.output_file,
                                               tag=args.tag,
                                               exclude_frozen_parameters=args.exclude_frozen_parameters)
diff --git a/checkpoint-216/README.md b/checkpoint-216/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..fdf619c317c2fe82074662582dbd68166b6f9d50
--- /dev/null
+++ b/checkpoint-216/README.md
@@ -0,0 +1,202 @@
+---
+base_model: nvidia/Llama-3.1-Nemotron-70B-Instruct-HF
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+
+### Framework versions
+
+- PEFT 0.15.0
\ No newline at end of file
diff --git a/checkpoint-216/adapter_config.json b/checkpoint-216/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..3abb5d68d20446d2b99ace226d6233a68590205a
--- /dev/null
+++ b/checkpoint-216/adapter_config.json
@@ -0,0 +1,42 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF",
+ "bias": "none",
+ "corda_config": null,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": null,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 512,
+ "lora_bias": false,
+ "lora_dropout": 0.05,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": [
+ "embed_tokens",
+ "lm_head"
+ ],
+ "peft_type": "LORA",
+ "r": 256,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "o_proj",
+ "up_proj",
+ "down_proj",
+ "gate_proj",
+ "v_proj",
+ "k_proj",
+ "q_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-216/adapter_model.safetensors b/checkpoint-216/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..7986df1a627493ff0d52f56bcaea0208cf341255
--- /dev/null
+++ b/checkpoint-216/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4a895406604b92c95bc4bb0ace0ac44fbd7c15114ee5e7626cc6d86e7d7a16f2
+size 10829849744
diff --git a/checkpoint-216/global_step214/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-216/global_step214/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..769afdb095ad6b16a0306d23f91ec8f83a00f66c
--- /dev/null
+++ b/checkpoint-216/global_step214/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bb379dddb81a55011668ab46e3f21a5e5ce7090ff898fe8fa2fe2f8fe6efd6a3
+size 21659418140
diff --git a/checkpoint-216/global_step214/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-216/global_step214/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..2a340541db469187d7db1241a1400b406618bf0c
--- /dev/null
+++ b/checkpoint-216/global_step214/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ab7aafb5607d1ced882755e4091f143c862422f78fdac6dd19fe1e5ace74bb33
+size 21659457372
diff --git a/checkpoint-216/global_step214/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-216/global_step214/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..5afbd7a5e9d041510f22a912c9d941898e15085d
--- /dev/null
+++ b/checkpoint-216/global_step214/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d6ca70f939f83639ec03065b13309e4449d6d52c7099c178374ad52485d89dbd
+size 21659417820
diff --git a/checkpoint-216/global_step214/mp_rank_00_model_states.pt b/checkpoint-216/global_step214/mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..5077a2f8aad895eb460ef7c54aa0aebe33e9e57f
--- /dev/null
+++ b/checkpoint-216/global_step214/mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b4f4742360905034c74c7a6b928a6b96a7c6c3fc1d8c9e349dd3b1e7ddfce515
+size 11918643933
diff --git a/checkpoint-216/latest b/checkpoint-216/latest
new file mode 100644
index 0000000000000000000000000000000000000000..4e37c1a038b3403862a938449a34498d62500618
--- /dev/null
+++ b/checkpoint-216/latest
@@ -0,0 +1 @@
+global_step214
\ No newline at end of file
diff --git a/checkpoint-216/rng_state_0.pth b/checkpoint-216/rng_state_0.pth
new file mode 100644
index 0000000000000000000000000000000000000000..ba5a90758511717b18c86dec56dec274c94d97c4
--- /dev/null
+++ b/checkpoint-216/rng_state_0.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2ed596fc93f566986c2d858ceb6a24c13dfe40c2b6101df11b4cc46fd672586f
+size 14768
diff --git a/checkpoint-216/rng_state_1.pth b/checkpoint-216/rng_state_1.pth
new file mode 100644
index 0000000000000000000000000000000000000000..7a3f8036f33cbe77dd6464b3d48d8d636ba8ba27
--- /dev/null
+++ b/checkpoint-216/rng_state_1.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:24cbc883095047d687cc74366529afb0b786a7226b5ea9db155182b7cc6317da
+size 14768
diff --git a/checkpoint-216/rng_state_2.pth b/checkpoint-216/rng_state_2.pth
new file mode 100644
index 0000000000000000000000000000000000000000..564f05e4f0b5793bc8ea053a987335304a02c0ed
--- /dev/null
+++ b/checkpoint-216/rng_state_2.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2975a1b5a2dca31e958a33852c2eb51b98f38898301485c941de812ec9019925
+size 14768
diff --git a/checkpoint-216/scheduler.pt b/checkpoint-216/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..5aa48f9e2224074123d70cbfd49c476a8ca21ea3
--- /dev/null
+++ b/checkpoint-216/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8a177484072060bf319ca9a44b8c986d20ca392d8b2158584c14221fe24d8381
+size 1064
diff --git a/checkpoint-216/special_tokens_map.json b/checkpoint-216/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..278b7f0f84be865c4687700ee7b3c63d89a51e18
--- /dev/null
+++ b/checkpoint-216/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+ "bos_token": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "<|eot_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/checkpoint-216/tokenizer.json b/checkpoint-216/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2
--- /dev/null
+++ b/checkpoint-216/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
+size 17209920
diff --git a/checkpoint-216/tokenizer_config.json b/checkpoint-216/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..fdde94c29816839ec3c29d6c6461206a49911f3c
--- /dev/null
+++ b/checkpoint-216/tokenizer_config.json
@@ -0,0 +1,2064 @@
+{
+ "added_tokens_decoder": {
+ "128000": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128001": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128002": {
+ "content": "<|reserved_special_token_0|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128003": {
+ "content": "<|reserved_special_token_1|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128004": {
+ "content": "<|finetune_right_pad_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128005": {
+ "content": "<|reserved_special_token_2|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128006": {
+ "content": "<|start_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128007": {
+ "content": "<|end_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128008": {
+ "content": "<|eom_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128009": {
+ "content": "<|eot_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128010": {
+ "content": "<|python_tag|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128011": {
+ "content": "<|reserved_special_token_3|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128012": {
+ "content": "<|reserved_special_token_4|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128013": {
+ "content": "<|reserved_special_token_5|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128014": {
+ "content": "<|reserved_special_token_6|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128015": {
+ "content": "<|reserved_special_token_7|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128016": {
+ "content": "<|reserved_special_token_8|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128017": {
+ "content": "<|reserved_special_token_9|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128018": {
+ "content": "<|reserved_special_token_10|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128019": {
+ "content": "<|reserved_special_token_11|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128020": {
+ "content": "<|reserved_special_token_12|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128021": {
+ "content": "<|reserved_special_token_13|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128022": {
+ "content": "<|reserved_special_token_14|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128023": {
+ "content": "<|reserved_special_token_15|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128024": {
+ "content": "<|reserved_special_token_16|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128025": {
+ "content": "<|reserved_special_token_17|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128026": {
+ "content": "<|reserved_special_token_18|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128027": {
+ "content": "<|reserved_special_token_19|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128028": {
+ "content": "<|reserved_special_token_20|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128029": {
+ "content": "<|reserved_special_token_21|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128030": {
+ "content": "<|reserved_special_token_22|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128031": {
+ "content": "<|reserved_special_token_23|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128032": {
+ "content": "<|reserved_special_token_24|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128033": {
+ "content": "<|reserved_special_token_25|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128034": {
+ "content": "<|reserved_special_token_26|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128035": {
+ "content": "<|reserved_special_token_27|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128036": {
+ "content": "<|reserved_special_token_28|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128037": {
+ "content": "<|reserved_special_token_29|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128038": {
+ "content": "<|reserved_special_token_30|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128039": {
+ "content": "<|reserved_special_token_31|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128040": {
+ "content": "<|reserved_special_token_32|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128041": {
+ "content": "<|reserved_special_token_33|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128042": {
+ "content": "<|reserved_special_token_34|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128043": {
+ "content": "<|reserved_special_token_35|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128044": {
+ "content": "<|reserved_special_token_36|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128045": {
+ "content": "<|reserved_special_token_37|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128046": {
+ "content": "<|reserved_special_token_38|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128047": {
+ "content": "<|reserved_special_token_39|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128048": {
+ "content": "<|reserved_special_token_40|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128049": {
+ "content": "<|reserved_special_token_41|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128050": {
+ "content": "<|reserved_special_token_42|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128051": {
+ "content": "<|reserved_special_token_43|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128052": {
+ "content": "<|reserved_special_token_44|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128053": {
+ "content": "<|reserved_special_token_45|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128054": {
+ "content": "<|reserved_special_token_46|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128055": {
+ "content": "<|reserved_special_token_47|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128056": {
+ "content": "<|reserved_special_token_48|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128057": {
+ "content": "<|reserved_special_token_49|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128058": {
+ "content": "<|reserved_special_token_50|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128059": {
+ "content": "<|reserved_special_token_51|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128060": {
+ "content": "<|reserved_special_token_52|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128061": {
+ "content": "<|reserved_special_token_53|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128062": {
+ "content": "<|reserved_special_token_54|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128063": {
+ "content": "<|reserved_special_token_55|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128064": {
+ "content": "<|reserved_special_token_56|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128065": {
+ "content": "<|reserved_special_token_57|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128066": {
+ "content": "<|reserved_special_token_58|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128067": {
+ "content": "<|reserved_special_token_59|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128068": {
+ "content": "<|reserved_special_token_60|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128069": {
+ "content": "<|reserved_special_token_61|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128070": {
+ "content": "<|reserved_special_token_62|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128071": {
+ "content": "<|reserved_special_token_63|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128072": {
+ "content": "<|reserved_special_token_64|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128073": {
+ "content": "<|reserved_special_token_65|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128074": {
+ "content": "<|reserved_special_token_66|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128075": {
+ "content": "<|reserved_special_token_67|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128076": {
+ "content": "<|reserved_special_token_68|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128077": {
+ "content": "<|reserved_special_token_69|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128078": {
+ "content": "<|reserved_special_token_70|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128079": {
+ "content": "<|reserved_special_token_71|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128080": {
+ "content": "<|reserved_special_token_72|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128081": {
+ "content": "<|reserved_special_token_73|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128082": {
+ "content": "<|reserved_special_token_74|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128083": {
+ "content": "<|reserved_special_token_75|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128084": {
+ "content": "<|reserved_special_token_76|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128085": {
+ "content": "<|reserved_special_token_77|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128086": {
+ "content": "<|reserved_special_token_78|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128087": {
+ "content": "<|reserved_special_token_79|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128088": {
+ "content": "<|reserved_special_token_80|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128089": {
+ "content": "<|reserved_special_token_81|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128090": {
+ "content": "<|reserved_special_token_82|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128091": {
+ "content": "<|reserved_special_token_83|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128092": {
+ "content": "<|reserved_special_token_84|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128093": {
+ "content": "<|reserved_special_token_85|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128094": {
+ "content": "<|reserved_special_token_86|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128095": {
+ "content": "<|reserved_special_token_87|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128096": {
+ "content": "<|reserved_special_token_88|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128097": {
+ "content": "<|reserved_special_token_89|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128098": {
+ "content": "<|reserved_special_token_90|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128099": {
+ "content": "<|reserved_special_token_91|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128100": {
+ "content": "<|reserved_special_token_92|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128101": {
+ "content": "<|reserved_special_token_93|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128102": {
+ "content": "<|reserved_special_token_94|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128103": {
+ "content": "<|reserved_special_token_95|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128104": {
+ "content": "<|reserved_special_token_96|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128105": {
+ "content": "<|reserved_special_token_97|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128106": {
+ "content": "<|reserved_special_token_98|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128107": {
+ "content": "<|reserved_special_token_99|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128108": {
+ "content": "<|reserved_special_token_100|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128109": {
+ "content": "<|reserved_special_token_101|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128110": {
+ "content": "<|reserved_special_token_102|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128111": {
+ "content": "<|reserved_special_token_103|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128112": {
+ "content": "<|reserved_special_token_104|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128113": {
+ "content": "<|reserved_special_token_105|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128114": {
+ "content": "<|reserved_special_token_106|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128115": {
+ "content": "<|reserved_special_token_107|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128116": {
+ "content": "<|reserved_special_token_108|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128117": {
+ "content": "<|reserved_special_token_109|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128118": {
+ "content": "<|reserved_special_token_110|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128119": {
+ "content": "<|reserved_special_token_111|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128120": {
+ "content": "<|reserved_special_token_112|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128121": {
+ "content": "<|reserved_special_token_113|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128122": {
+ "content": "<|reserved_special_token_114|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128123": {
+ "content": "<|reserved_special_token_115|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128124": {
+ "content": "<|reserved_special_token_116|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128125": {
+ "content": "<|reserved_special_token_117|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128126": {
+ "content": "<|reserved_special_token_118|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128127": {
+ "content": "<|reserved_special_token_119|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128128": {
+ "content": "<|reserved_special_token_120|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128129": {
+ "content": "<|reserved_special_token_121|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128130": {
+ "content": "<|reserved_special_token_122|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128131": {
+ "content": "<|reserved_special_token_123|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128132": {
+ "content": "<|reserved_special_token_124|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128133": {
+ "content": "<|reserved_special_token_125|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128134": {
+ "content": "<|reserved_special_token_126|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128135": {
+ "content": "<|reserved_special_token_127|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128136": {
+ "content": "<|reserved_special_token_128|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128137": {
+ "content": "<|reserved_special_token_129|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128138": {
+ "content": "<|reserved_special_token_130|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128139": {
+ "content": "<|reserved_special_token_131|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128140": {
+ "content": "<|reserved_special_token_132|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128141": {
+ "content": "<|reserved_special_token_133|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128142": {
+ "content": "<|reserved_special_token_134|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128143": {
+ "content": "<|reserved_special_token_135|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128144": {
+ "content": "<|reserved_special_token_136|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128145": {
+ "content": "<|reserved_special_token_137|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128146": {
+ "content": "<|reserved_special_token_138|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128147": {
+ "content": "<|reserved_special_token_139|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128148": {
+ "content": "<|reserved_special_token_140|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128149": {
+ "content": "<|reserved_special_token_141|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128150": {
+ "content": "<|reserved_special_token_142|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128151": {
+ "content": "<|reserved_special_token_143|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128152": {
+ "content": "<|reserved_special_token_144|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128153": {
+ "content": "<|reserved_special_token_145|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128154": {
+ "content": "<|reserved_special_token_146|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128155": {
+ "content": "<|reserved_special_token_147|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128156": {
+ "content": "<|reserved_special_token_148|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128157": {
+ "content": "<|reserved_special_token_149|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128158": {
+ "content": "<|reserved_special_token_150|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128159": {
+ "content": "<|reserved_special_token_151|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128160": {
+ "content": "<|reserved_special_token_152|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128161": {
+ "content": "<|reserved_special_token_153|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128162": {
+ "content": "<|reserved_special_token_154|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128163": {
+ "content": "<|reserved_special_token_155|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128164": {
+ "content": "<|reserved_special_token_156|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128165": {
+ "content": "<|reserved_special_token_157|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128166": {
+ "content": "<|reserved_special_token_158|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128167": {
+ "content": "<|reserved_special_token_159|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128168": {
+ "content": "<|reserved_special_token_160|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128169": {
+ "content": "<|reserved_special_token_161|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128170": {
+ "content": "<|reserved_special_token_162|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128171": {
+ "content": "<|reserved_special_token_163|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128172": {
+ "content": "<|reserved_special_token_164|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128173": {
+ "content": "<|reserved_special_token_165|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128174": {
+ "content": "<|reserved_special_token_166|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128175": {
+ "content": "<|reserved_special_token_167|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128176": {
+ "content": "<|reserved_special_token_168|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128177": {
+ "content": "<|reserved_special_token_169|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128178": {
+ "content": "<|reserved_special_token_170|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128179": {
+ "content": "<|reserved_special_token_171|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128180": {
+ "content": "<|reserved_special_token_172|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128181": {
+ "content": "<|reserved_special_token_173|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128182": {
+ "content": "<|reserved_special_token_174|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128183": {
+ "content": "<|reserved_special_token_175|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128184": {
+ "content": "<|reserved_special_token_176|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128185": {
+ "content": "<|reserved_special_token_177|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128186": {
+ "content": "<|reserved_special_token_178|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128187": {
+ "content": "<|reserved_special_token_179|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128188": {
+ "content": "<|reserved_special_token_180|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128189": {
+ "content": "<|reserved_special_token_181|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128190": {
+ "content": "<|reserved_special_token_182|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128191": {
+ "content": "<|reserved_special_token_183|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128192": {
+ "content": "<|reserved_special_token_184|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128193": {
+ "content": "<|reserved_special_token_185|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128194": {
+ "content": "<|reserved_special_token_186|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128195": {
+ "content": "<|reserved_special_token_187|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128196": {
+ "content": "<|reserved_special_token_188|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128197": {
+ "content": "<|reserved_special_token_189|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128198": {
+ "content": "<|reserved_special_token_190|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128199": {
+ "content": "<|reserved_special_token_191|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128200": {
+ "content": "<|reserved_special_token_192|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128201": {
+ "content": "<|reserved_special_token_193|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128202": {
+ "content": "<|reserved_special_token_194|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128203": {
+ "content": "<|reserved_special_token_195|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128204": {
+ "content": "<|reserved_special_token_196|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128205": {
+ "content": "<|reserved_special_token_197|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128206": {
+ "content": "<|reserved_special_token_198|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128207": {
+ "content": "<|reserved_special_token_199|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128208": {
+ "content": "<|reserved_special_token_200|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128209": {
+ "content": "<|reserved_special_token_201|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128210": {
+ "content": "<|reserved_special_token_202|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128211": {
+ "content": "<|reserved_special_token_203|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128212": {
+ "content": "<|reserved_special_token_204|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128213": {
+ "content": "<|reserved_special_token_205|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128214": {
+ "content": "<|reserved_special_token_206|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128215": {
+ "content": "<|reserved_special_token_207|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128216": {
+ "content": "<|reserved_special_token_208|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128217": {
+ "content": "<|reserved_special_token_209|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128218": {
+ "content": "<|reserved_special_token_210|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128219": {
+ "content": "<|reserved_special_token_211|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128220": {
+ "content": "<|reserved_special_token_212|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128221": {
+ "content": "<|reserved_special_token_213|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128222": {
+ "content": "<|reserved_special_token_214|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128223": {
+ "content": "<|reserved_special_token_215|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128224": {
+ "content": "<|reserved_special_token_216|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128225": {
+ "content": "<|reserved_special_token_217|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128226": {
+ "content": "<|reserved_special_token_218|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128227": {
+ "content": "<|reserved_special_token_219|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128228": {
+ "content": "<|reserved_special_token_220|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128229": {
+ "content": "<|reserved_special_token_221|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128230": {
+ "content": "<|reserved_special_token_222|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128231": {
+ "content": "<|reserved_special_token_223|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128232": {
+ "content": "<|reserved_special_token_224|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128233": {
+ "content": "<|reserved_special_token_225|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128234": {
+ "content": "<|reserved_special_token_226|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128235": {
+ "content": "<|reserved_special_token_227|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128236": {
+ "content": "<|reserved_special_token_228|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128237": {
+ "content": "<|reserved_special_token_229|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128238": {
+ "content": "<|reserved_special_token_230|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128239": {
+ "content": "<|reserved_special_token_231|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128240": {
+ "content": "<|reserved_special_token_232|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128241": {
+ "content": "<|reserved_special_token_233|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128242": {
+ "content": "<|reserved_special_token_234|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128243": {
+ "content": "<|reserved_special_token_235|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128244": {
+ "content": "<|reserved_special_token_236|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128245": {
+ "content": "<|reserved_special_token_237|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128246": {
+ "content": "<|reserved_special_token_238|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128247": {
+ "content": "<|reserved_special_token_239|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128248": {
+ "content": "<|reserved_special_token_240|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128249": {
+ "content": "<|reserved_special_token_241|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128250": {
+ "content": "<|reserved_special_token_242|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128251": {
+ "content": "<|reserved_special_token_243|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128252": {
+ "content": "<|reserved_special_token_244|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128253": {
+ "content": "<|reserved_special_token_245|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128254": {
+ "content": "<|reserved_special_token_246|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128255": {
+ "content": "<|reserved_special_token_247|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<|begin_of_text|>",
+ "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n",
+ "clean_up_tokenization_spaces": true,
+ "eos_token": "<|eot_id|>",
+ "extra_special_tokens": {},
+ "model_input_names": [
+ "input_ids",
+ "attention_mask"
+ ],
+ "model_max_length": 131072,
+ "pad_token": "<|end_of_text|>",
+ "tokenizer_class": "PreTrainedTokenizer"
+}
diff --git a/checkpoint-216/trainer_state.json b/checkpoint-216/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..5251750818bb2a619a406591f437506fb1373e44
--- /dev/null
+++ b/checkpoint-216/trainer_state.json
@@ -0,0 +1,1545 @@
+{
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 2.967741935483871,
+ "eval_steps": 500,
+ "global_step": 216,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.013824884792626729,
+ "grad_norm": 34.963134765625,
+ "learning_rate": 5.0000000000000004e-08,
+ "loss": 2.5476,
+ "step": 1
+ },
+ {
+ "epoch": 0.027649769585253458,
+ "grad_norm": 35.32600021362305,
+ "learning_rate": 1.0000000000000001e-07,
+ "loss": 2.6058,
+ "step": 2
+ },
+ {
+ "epoch": 0.041474654377880185,
+ "grad_norm": 34.955448150634766,
+ "learning_rate": 1.5000000000000002e-07,
+ "loss": 2.5871,
+ "step": 3
+ },
+ {
+ "epoch": 0.055299539170506916,
+ "grad_norm": 35.09806442260742,
+ "learning_rate": 2.0000000000000002e-07,
+ "loss": 2.5912,
+ "step": 4
+ },
+ {
+ "epoch": 0.06912442396313365,
+ "grad_norm": 34.88739776611328,
+ "learning_rate": 2.5000000000000004e-07,
+ "loss": 2.592,
+ "step": 5
+ },
+ {
+ "epoch": 0.08294930875576037,
+ "grad_norm": 34.84288024902344,
+ "learning_rate": 3.0000000000000004e-07,
+ "loss": 2.5609,
+ "step": 6
+ },
+ {
+ "epoch": 0.0967741935483871,
+ "grad_norm": 35.0090217590332,
+ "learning_rate": 3.5000000000000004e-07,
+ "loss": 2.5651,
+ "step": 7
+ },
+ {
+ "epoch": 0.11059907834101383,
+ "grad_norm": 35.03983688354492,
+ "learning_rate": 4.0000000000000003e-07,
+ "loss": 2.5437,
+ "step": 8
+ },
+ {
+ "epoch": 0.12442396313364056,
+ "grad_norm": 34.802833557128906,
+ "learning_rate": 4.5000000000000003e-07,
+ "loss": 2.5448,
+ "step": 9
+ },
+ {
+ "epoch": 0.1382488479262673,
+ "grad_norm": 34.5220947265625,
+ "learning_rate": 5.000000000000001e-07,
+ "loss": 2.504,
+ "step": 10
+ },
+ {
+ "epoch": 0.15207373271889402,
+ "grad_norm": 34.401580810546875,
+ "learning_rate": 5.5e-07,
+ "loss": 2.4814,
+ "step": 11
+ },
+ {
+ "epoch": 0.16589861751152074,
+ "grad_norm": 33.76997375488281,
+ "learning_rate": 6.000000000000001e-07,
+ "loss": 2.4282,
+ "step": 12
+ },
+ {
+ "epoch": 0.17972350230414746,
+ "grad_norm": 33.53415298461914,
+ "learning_rate": 6.5e-07,
+ "loss": 2.4216,
+ "step": 13
+ },
+ {
+ "epoch": 0.1935483870967742,
+ "grad_norm": 32.401580810546875,
+ "learning_rate": 7.000000000000001e-07,
+ "loss": 2.3362,
+ "step": 14
+ },
+ {
+ "epoch": 0.2073732718894009,
+ "grad_norm": 33.636661529541016,
+ "learning_rate": 7.5e-07,
+ "loss": 2.2978,
+ "step": 15
+ },
+ {
+ "epoch": 0.22119815668202766,
+ "grad_norm": 31.3782901763916,
+ "learning_rate": 8.000000000000001e-07,
+ "loss": 2.1358,
+ "step": 16
+ },
+ {
+ "epoch": 0.2350230414746544,
+ "grad_norm": 30.72391700744629,
+ "learning_rate": 8.500000000000001e-07,
+ "loss": 2.0652,
+ "step": 17
+ },
+ {
+ "epoch": 0.2488479262672811,
+ "grad_norm": 30.817584991455078,
+ "learning_rate": 9.000000000000001e-07,
+ "loss": 2.0115,
+ "step": 18
+ },
+ {
+ "epoch": 0.2626728110599078,
+ "grad_norm": 29.683996200561523,
+ "learning_rate": 9.500000000000001e-07,
+ "loss": 1.8668,
+ "step": 19
+ },
+ {
+ "epoch": 0.2764976958525346,
+ "grad_norm": 29.506683349609375,
+ "learning_rate": 1.0000000000000002e-06,
+ "loss": 1.7796,
+ "step": 20
+ },
+ {
+ "epoch": 0.2903225806451613,
+ "grad_norm": 27.55340003967285,
+ "learning_rate": 1.0500000000000001e-06,
+ "loss": 1.5656,
+ "step": 21
+ },
+ {
+ "epoch": 0.30414746543778803,
+ "grad_norm": 27.78036880493164,
+ "learning_rate": 1.1e-06,
+ "loss": 1.5112,
+ "step": 22
+ },
+ {
+ "epoch": 0.31797235023041476,
+ "grad_norm": 26.36115264892578,
+ "learning_rate": 1.1500000000000002e-06,
+ "loss": 1.3283,
+ "step": 23
+ },
+ {
+ "epoch": 0.3317972350230415,
+ "grad_norm": 25.388761520385742,
+ "learning_rate": 1.2000000000000002e-06,
+ "loss": 1.137,
+ "step": 24
+ },
+ {
+ "epoch": 0.3456221198156682,
+ "grad_norm": 25.21432876586914,
+ "learning_rate": 1.25e-06,
+ "loss": 0.9867,
+ "step": 25
+ },
+ {
+ "epoch": 0.35944700460829493,
+ "grad_norm": 24.924489974975586,
+ "learning_rate": 1.3e-06,
+ "loss": 0.7122,
+ "step": 26
+ },
+ {
+ "epoch": 0.37327188940092165,
+ "grad_norm": 21.881420135498047,
+ "learning_rate": 1.3500000000000002e-06,
+ "loss": 0.4952,
+ "step": 27
+ },
+ {
+ "epoch": 0.3870967741935484,
+ "grad_norm": 17.67154884338379,
+ "learning_rate": 1.4000000000000001e-06,
+ "loss": 0.3602,
+ "step": 28
+ },
+ {
+ "epoch": 0.4009216589861751,
+ "grad_norm": 11.489490509033203,
+ "learning_rate": 1.45e-06,
+ "loss": 0.2432,
+ "step": 29
+ },
+ {
+ "epoch": 0.4147465437788018,
+ "grad_norm": 7.622438907623291,
+ "learning_rate": 1.5e-06,
+ "loss": 0.189,
+ "step": 30
+ },
+ {
+ "epoch": 0.42857142857142855,
+ "grad_norm": 4.340638637542725,
+ "learning_rate": 1.5500000000000002e-06,
+ "loss": 0.1302,
+ "step": 31
+ },
+ {
+ "epoch": 0.4423963133640553,
+ "grad_norm": 3.079514980316162,
+ "learning_rate": 1.6000000000000001e-06,
+ "loss": 0.1075,
+ "step": 32
+ },
+ {
+ "epoch": 0.45622119815668205,
+ "grad_norm": 2.355943441390991,
+ "learning_rate": 1.6500000000000003e-06,
+ "loss": 0.0998,
+ "step": 33
+ },
+ {
+ "epoch": 0.4700460829493088,
+ "grad_norm": 1.9480725526809692,
+ "learning_rate": 1.7000000000000002e-06,
+ "loss": 0.0926,
+ "step": 34
+ },
+ {
+ "epoch": 0.4838709677419355,
+ "grad_norm": 1.8598166704177856,
+ "learning_rate": 1.75e-06,
+ "loss": 0.0733,
+ "step": 35
+ },
+ {
+ "epoch": 0.4976958525345622,
+ "grad_norm": 0.9892730712890625,
+ "learning_rate": 1.8000000000000001e-06,
+ "loss": 0.0664,
+ "step": 36
+ },
+ {
+ "epoch": 0.511520737327189,
+ "grad_norm": 0.8992418050765991,
+ "learning_rate": 1.85e-06,
+ "loss": 0.0709,
+ "step": 37
+ },
+ {
+ "epoch": 0.5253456221198156,
+ "grad_norm": 0.7340101599693298,
+ "learning_rate": 1.9000000000000002e-06,
+ "loss": 0.0535,
+ "step": 38
+ },
+ {
+ "epoch": 0.5391705069124424,
+ "grad_norm": 0.7032178044319153,
+ "learning_rate": 1.9500000000000004e-06,
+ "loss": 0.0573,
+ "step": 39
+ },
+ {
+ "epoch": 0.5529953917050692,
+ "grad_norm": 0.6449429392814636,
+ "learning_rate": 2.0000000000000003e-06,
+ "loss": 0.0576,
+ "step": 40
+ },
+ {
+ "epoch": 0.5668202764976958,
+ "grad_norm": 0.6358592510223389,
+ "learning_rate": 2.05e-06,
+ "loss": 0.0502,
+ "step": 41
+ },
+ {
+ "epoch": 0.5806451612903226,
+ "grad_norm": 0.572036623954773,
+ "learning_rate": 2.1000000000000002e-06,
+ "loss": 0.0556,
+ "step": 42
+ },
+ {
+ "epoch": 0.5944700460829493,
+ "grad_norm": 0.6538863778114319,
+ "learning_rate": 2.15e-06,
+ "loss": 0.0556,
+ "step": 43
+ },
+ {
+ "epoch": 0.6082949308755761,
+ "grad_norm": 0.3532159626483917,
+ "learning_rate": 2.2e-06,
+ "loss": 0.0452,
+ "step": 44
+ },
+ {
+ "epoch": 0.6221198156682027,
+ "grad_norm": 0.4853012263774872,
+ "learning_rate": 2.25e-06,
+ "loss": 0.0471,
+ "step": 45
+ },
+ {
+ "epoch": 0.6359447004608295,
+ "grad_norm": 0.4761648178100586,
+ "learning_rate": 2.3000000000000004e-06,
+ "loss": 0.0469,
+ "step": 46
+ },
+ {
+ "epoch": 0.6497695852534562,
+ "grad_norm": 0.6094638109207153,
+ "learning_rate": 2.35e-06,
+ "loss": 0.047,
+ "step": 47
+ },
+ {
+ "epoch": 0.663594470046083,
+ "grad_norm": 0.5211306214332581,
+ "learning_rate": 2.4000000000000003e-06,
+ "loss": 0.0402,
+ "step": 48
+ },
+ {
+ "epoch": 0.6774193548387096,
+ "grad_norm": 0.2997778356075287,
+ "learning_rate": 2.4500000000000003e-06,
+ "loss": 0.0425,
+ "step": 49
+ },
+ {
+ "epoch": 0.6912442396313364,
+ "grad_norm": 0.37834689021110535,
+ "learning_rate": 2.5e-06,
+ "loss": 0.0446,
+ "step": 50
+ },
+ {
+ "epoch": 0.7050691244239631,
+ "grad_norm": 0.31011995673179626,
+ "learning_rate": 2.55e-06,
+ "loss": 0.0406,
+ "step": 51
+ },
+ {
+ "epoch": 0.7188940092165899,
+ "grad_norm": 0.3113131523132324,
+ "learning_rate": 2.6e-06,
+ "loss": 0.0368,
+ "step": 52
+ },
+ {
+ "epoch": 0.7327188940092166,
+ "grad_norm": 0.5685846209526062,
+ "learning_rate": 2.6500000000000005e-06,
+ "loss": 0.0389,
+ "step": 53
+ },
+ {
+ "epoch": 0.7465437788018433,
+ "grad_norm": 0.29334983229637146,
+ "learning_rate": 2.7000000000000004e-06,
+ "loss": 0.0423,
+ "step": 54
+ },
+ {
+ "epoch": 0.7603686635944701,
+ "grad_norm": 0.5776861906051636,
+ "learning_rate": 2.7500000000000004e-06,
+ "loss": 0.0399,
+ "step": 55
+ },
+ {
+ "epoch": 0.7741935483870968,
+ "grad_norm": 0.35423165559768677,
+ "learning_rate": 2.8000000000000003e-06,
+ "loss": 0.0357,
+ "step": 56
+ },
+ {
+ "epoch": 0.7880184331797235,
+ "grad_norm": 0.37902742624282837,
+ "learning_rate": 2.85e-06,
+ "loss": 0.0407,
+ "step": 57
+ },
+ {
+ "epoch": 0.8018433179723502,
+ "grad_norm": 0.26948878169059753,
+ "learning_rate": 2.9e-06,
+ "loss": 0.0351,
+ "step": 58
+ },
+ {
+ "epoch": 0.815668202764977,
+ "grad_norm": 0.35688117146492004,
+ "learning_rate": 2.95e-06,
+ "loss": 0.0377,
+ "step": 59
+ },
+ {
+ "epoch": 0.8294930875576036,
+ "grad_norm": 0.5287911891937256,
+ "learning_rate": 3e-06,
+ "loss": 0.0377,
+ "step": 60
+ },
+ {
+ "epoch": 0.8433179723502304,
+ "grad_norm": 0.2950785756111145,
+ "learning_rate": 3.05e-06,
+ "loss": 0.0361,
+ "step": 61
+ },
+ {
+ "epoch": 0.8571428571428571,
+ "grad_norm": 0.2789723575115204,
+ "learning_rate": 3.1000000000000004e-06,
+ "loss": 0.032,
+ "step": 62
+ },
+ {
+ "epoch": 0.8709677419354839,
+ "grad_norm": 0.2802198529243469,
+ "learning_rate": 3.1500000000000003e-06,
+ "loss": 0.0394,
+ "step": 63
+ },
+ {
+ "epoch": 0.8847926267281107,
+ "grad_norm": 0.286981463432312,
+ "learning_rate": 3.2000000000000003e-06,
+ "loss": 0.033,
+ "step": 64
+ },
+ {
+ "epoch": 0.8986175115207373,
+ "grad_norm": 0.37392762303352356,
+ "learning_rate": 3.2500000000000002e-06,
+ "loss": 0.0335,
+ "step": 65
+ },
+ {
+ "epoch": 0.9124423963133641,
+ "grad_norm": 0.25025588274002075,
+ "learning_rate": 3.3000000000000006e-06,
+ "loss": 0.0311,
+ "step": 66
+ },
+ {
+ "epoch": 0.9262672811059908,
+ "grad_norm": 0.4292861521244049,
+ "learning_rate": 3.3500000000000005e-06,
+ "loss": 0.0362,
+ "step": 67
+ },
+ {
+ "epoch": 0.9400921658986175,
+ "grad_norm": 0.4717651307582855,
+ "learning_rate": 3.4000000000000005e-06,
+ "loss": 0.0303,
+ "step": 68
+ },
+ {
+ "epoch": 0.9539170506912442,
+ "grad_norm": 0.49291253089904785,
+ "learning_rate": 3.45e-06,
+ "loss": 0.0352,
+ "step": 69
+ },
+ {
+ "epoch": 0.967741935483871,
+ "grad_norm": 0.3729935586452484,
+ "learning_rate": 3.5e-06,
+ "loss": 0.0297,
+ "step": 70
+ },
+ {
+ "epoch": 0.9815668202764977,
+ "grad_norm": 0.27150583267211914,
+ "learning_rate": 3.5500000000000003e-06,
+ "loss": 0.0326,
+ "step": 71
+ },
+ {
+ "epoch": 0.9953917050691244,
+ "grad_norm": 0.34516096115112305,
+ "learning_rate": 3.6000000000000003e-06,
+ "loss": 0.0336,
+ "step": 72
+ },
+ {
+ "epoch": 1.0,
+ "grad_norm": 0.34516096115112305,
+ "learning_rate": 3.65e-06,
+ "loss": 0.0274,
+ "step": 73
+ },
+ {
+ "epoch": 1.0138248847926268,
+ "grad_norm": 0.6282734870910645,
+ "learning_rate": 3.7e-06,
+ "loss": 0.0289,
+ "step": 74
+ },
+ {
+ "epoch": 1.0276497695852536,
+ "grad_norm": 0.2935558557510376,
+ "learning_rate": 3.7500000000000005e-06,
+ "loss": 0.0308,
+ "step": 75
+ },
+ {
+ "epoch": 1.0414746543778801,
+ "grad_norm": 0.3166769742965698,
+ "learning_rate": 3.8000000000000005e-06,
+ "loss": 0.0277,
+ "step": 76
+ },
+ {
+ "epoch": 1.055299539170507,
+ "grad_norm": 0.38190239667892456,
+ "learning_rate": 3.85e-06,
+ "loss": 0.0338,
+ "step": 77
+ },
+ {
+ "epoch": 1.0691244239631337,
+ "grad_norm": 0.2779421806335449,
+ "learning_rate": 3.900000000000001e-06,
+ "loss": 0.03,
+ "step": 78
+ },
+ {
+ "epoch": 1.0829493087557605,
+ "grad_norm": 0.4055996537208557,
+ "learning_rate": 3.95e-06,
+ "loss": 0.0295,
+ "step": 79
+ },
+ {
+ "epoch": 1.096774193548387,
+ "grad_norm": 0.2987312972545624,
+ "learning_rate": 4.000000000000001e-06,
+ "loss": 0.028,
+ "step": 80
+ },
+ {
+ "epoch": 1.1105990783410138,
+ "grad_norm": 0.2674776017665863,
+ "learning_rate": 4.05e-06,
+ "loss": 0.0243,
+ "step": 81
+ },
+ {
+ "epoch": 1.1244239631336406,
+ "grad_norm": 0.29042816162109375,
+ "learning_rate": 4.1e-06,
+ "loss": 0.0318,
+ "step": 82
+ },
+ {
+ "epoch": 1.1382488479262673,
+ "grad_norm": 0.2904883027076721,
+ "learning_rate": 4.15e-06,
+ "loss": 0.0257,
+ "step": 83
+ },
+ {
+ "epoch": 1.1520737327188941,
+ "grad_norm": 0.30603015422821045,
+ "learning_rate": 4.2000000000000004e-06,
+ "loss": 0.0284,
+ "step": 84
+ },
+ {
+ "epoch": 1.1658986175115207,
+ "grad_norm": 0.23131045699119568,
+ "learning_rate": 4.25e-06,
+ "loss": 0.0285,
+ "step": 85
+ },
+ {
+ "epoch": 1.1797235023041475,
+ "grad_norm": 0.26788002252578735,
+ "learning_rate": 4.3e-06,
+ "loss": 0.0269,
+ "step": 86
+ },
+ {
+ "epoch": 1.1935483870967742,
+ "grad_norm": 0.2639651894569397,
+ "learning_rate": 4.350000000000001e-06,
+ "loss": 0.0289,
+ "step": 87
+ },
+ {
+ "epoch": 1.2073732718894008,
+ "grad_norm": 0.25068584084510803,
+ "learning_rate": 4.4e-06,
+ "loss": 0.0275,
+ "step": 88
+ },
+ {
+ "epoch": 1.2211981566820276,
+ "grad_norm": 0.25494542717933655,
+ "learning_rate": 4.450000000000001e-06,
+ "loss": 0.0275,
+ "step": 89
+ },
+ {
+ "epoch": 1.2350230414746544,
+ "grad_norm": 0.31125035881996155,
+ "learning_rate": 4.5e-06,
+ "loss": 0.0251,
+ "step": 90
+ },
+ {
+ "epoch": 1.2488479262672811,
+ "grad_norm": 0.2691773474216461,
+ "learning_rate": 4.5500000000000005e-06,
+ "loss": 0.0267,
+ "step": 91
+ },
+ {
+ "epoch": 1.262672811059908,
+ "grad_norm": 0.20079147815704346,
+ "learning_rate": 4.600000000000001e-06,
+ "loss": 0.0263,
+ "step": 92
+ },
+ {
+ "epoch": 1.2764976958525347,
+ "grad_norm": 0.28027331829071045,
+ "learning_rate": 4.65e-06,
+ "loss": 0.0227,
+ "step": 93
+ },
+ {
+ "epoch": 1.2903225806451613,
+ "grad_norm": 0.40053099393844604,
+ "learning_rate": 4.7e-06,
+ "loss": 0.0246,
+ "step": 94
+ },
+ {
+ "epoch": 1.304147465437788,
+ "grad_norm": 0.33066362142562866,
+ "learning_rate": 4.75e-06,
+ "loss": 0.0221,
+ "step": 95
+ },
+ {
+ "epoch": 1.3179723502304148,
+ "grad_norm": 0.2531339228153229,
+ "learning_rate": 4.800000000000001e-06,
+ "loss": 0.0216,
+ "step": 96
+ },
+ {
+ "epoch": 1.3317972350230414,
+ "grad_norm": 0.37544378638267517,
+ "learning_rate": 4.85e-06,
+ "loss": 0.0247,
+ "step": 97
+ },
+ {
+ "epoch": 1.3456221198156681,
+ "grad_norm": 0.34273672103881836,
+ "learning_rate": 4.9000000000000005e-06,
+ "loss": 0.0217,
+ "step": 98
+ },
+ {
+ "epoch": 1.359447004608295,
+ "grad_norm": 0.2338661253452301,
+ "learning_rate": 4.95e-06,
+ "loss": 0.0237,
+ "step": 99
+ },
+ {
+ "epoch": 1.3732718894009217,
+ "grad_norm": 0.30151981115341187,
+ "learning_rate": 5e-06,
+ "loss": 0.0248,
+ "step": 100
+ },
+ {
+ "epoch": 1.3870967741935485,
+ "grad_norm": 0.3205336630344391,
+ "learning_rate": 4.999888074163108e-06,
+ "loss": 0.0232,
+ "step": 101
+ },
+ {
+ "epoch": 1.400921658986175,
+ "grad_norm": 0.2705315351486206,
+ "learning_rate": 4.999552306674345e-06,
+ "loss": 0.0245,
+ "step": 102
+ },
+ {
+ "epoch": 1.4147465437788018,
+ "grad_norm": 0.2564137578010559,
+ "learning_rate": 4.998992727598557e-06,
+ "loss": 0.0274,
+ "step": 103
+ },
+ {
+ "epoch": 1.4285714285714286,
+ "grad_norm": 0.1967611312866211,
+ "learning_rate": 4.998209387040829e-06,
+ "loss": 0.0173,
+ "step": 104
+ },
+ {
+ "epoch": 1.4423963133640554,
+ "grad_norm": 0.2568240761756897,
+ "learning_rate": 4.9972023551419995e-06,
+ "loss": 0.0223,
+ "step": 105
+ },
+ {
+ "epoch": 1.456221198156682,
+ "grad_norm": 0.2236352413892746,
+ "learning_rate": 4.995971722072379e-06,
+ "loss": 0.0202,
+ "step": 106
+ },
+ {
+ "epoch": 1.4700460829493087,
+ "grad_norm": 0.3389627933502197,
+ "learning_rate": 4.9945175980236745e-06,
+ "loss": 0.0214,
+ "step": 107
+ },
+ {
+ "epoch": 1.4838709677419355,
+ "grad_norm": 0.31428012251853943,
+ "learning_rate": 4.992840113199131e-06,
+ "loss": 0.0188,
+ "step": 108
+ },
+ {
+ "epoch": 1.4976958525345623,
+ "grad_norm": 0.41508516669273376,
+ "learning_rate": 4.990939417801859e-06,
+ "loss": 0.0213,
+ "step": 109
+ },
+ {
+ "epoch": 1.511520737327189,
+ "grad_norm": 0.19615545868873596,
+ "learning_rate": 4.988815682021398e-06,
+ "loss": 0.0191,
+ "step": 110
+ },
+ {
+ "epoch": 1.5253456221198156,
+ "grad_norm": 0.2059931755065918,
+ "learning_rate": 4.986469096018472e-06,
+ "loss": 0.0208,
+ "step": 111
+ },
+ {
+ "epoch": 1.5391705069124424,
+ "grad_norm": 0.26946336030960083,
+ "learning_rate": 4.983899869907963e-06,
+ "loss": 0.0192,
+ "step": 112
+ },
+ {
+ "epoch": 1.5529953917050692,
+ "grad_norm": 0.3227538466453552,
+ "learning_rate": 4.981108233740096e-06,
+ "loss": 0.0169,
+ "step": 113
+ },
+ {
+ "epoch": 1.5668202764976957,
+ "grad_norm": 0.2811918258666992,
+ "learning_rate": 4.978094437479843e-06,
+ "loss": 0.0151,
+ "step": 114
+ },
+ {
+ "epoch": 1.5806451612903225,
+ "grad_norm": 0.32980477809906006,
+ "learning_rate": 4.97485875098454e-06,
+ "loss": 0.0182,
+ "step": 115
+ },
+ {
+ "epoch": 1.5944700460829493,
+ "grad_norm": 0.2759259045124054,
+ "learning_rate": 4.971401463979722e-06,
+ "loss": 0.0192,
+ "step": 116
+ },
+ {
+ "epoch": 1.608294930875576,
+ "grad_norm": 0.2572178840637207,
+ "learning_rate": 4.967722886033181e-06,
+ "loss": 0.0198,
+ "step": 117
+ },
+ {
+ "epoch": 1.6221198156682028,
+ "grad_norm": 0.3238658905029297,
+ "learning_rate": 4.963823346527249e-06,
+ "loss": 0.0186,
+ "step": 118
+ },
+ {
+ "epoch": 1.6359447004608296,
+ "grad_norm": 0.3834918737411499,
+ "learning_rate": 4.959703194629304e-06,
+ "loss": 0.0188,
+ "step": 119
+ },
+ {
+ "epoch": 1.6497695852534562,
+ "grad_norm": 0.23881244659423828,
+ "learning_rate": 4.955362799260507e-06,
+ "loss": 0.0182,
+ "step": 120
+ },
+ {
+ "epoch": 1.663594470046083,
+ "grad_norm": 0.1885918825864792,
+ "learning_rate": 4.950802549062764e-06,
+ "loss": 0.0183,
+ "step": 121
+ },
+ {
+ "epoch": 1.6774193548387095,
+ "grad_norm": 0.34959614276885986,
+ "learning_rate": 4.946022852363932e-06,
+ "loss": 0.0173,
+ "step": 122
+ },
+ {
+ "epoch": 1.6912442396313363,
+ "grad_norm": 0.22990310192108154,
+ "learning_rate": 4.9410241371412525e-06,
+ "loss": 0.0135,
+ "step": 123
+ },
+ {
+ "epoch": 1.705069124423963,
+ "grad_norm": 0.2790350615978241,
+ "learning_rate": 4.935806850983034e-06,
+ "loss": 0.0159,
+ "step": 124
+ },
+ {
+ "epoch": 1.7188940092165899,
+ "grad_norm": 0.3218020796775818,
+ "learning_rate": 4.9303714610485705e-06,
+ "loss": 0.0176,
+ "step": 125
+ },
+ {
+ "epoch": 1.7327188940092166,
+ "grad_norm": 0.2294609695672989,
+ "learning_rate": 4.924718454026318e-06,
+ "loss": 0.0149,
+ "step": 126
+ },
+ {
+ "epoch": 1.7465437788018434,
+ "grad_norm": 0.3427927494049072,
+ "learning_rate": 4.918848336090309e-06,
+ "loss": 0.0165,
+ "step": 127
+ },
+ {
+ "epoch": 1.7603686635944702,
+ "grad_norm": 0.22731825709342957,
+ "learning_rate": 4.912761632854834e-06,
+ "loss": 0.0145,
+ "step": 128
+ },
+ {
+ "epoch": 1.7741935483870968,
+ "grad_norm": 0.35364386439323425,
+ "learning_rate": 4.906458889327375e-06,
+ "loss": 0.0161,
+ "step": 129
+ },
+ {
+ "epoch": 1.7880184331797235,
+ "grad_norm": 0.29476454854011536,
+ "learning_rate": 4.899940669859807e-06,
+ "loss": 0.0154,
+ "step": 130
+ },
+ {
+ "epoch": 1.80184331797235,
+ "grad_norm": 0.28667864203453064,
+ "learning_rate": 4.893207558097867e-06,
+ "loss": 0.0143,
+ "step": 131
+ },
+ {
+ "epoch": 1.8156682027649769,
+ "grad_norm": 0.2731999158859253,
+ "learning_rate": 4.8862601569288885e-06,
+ "loss": 0.0141,
+ "step": 132
+ },
+ {
+ "epoch": 1.8294930875576036,
+ "grad_norm": 0.2670470178127289,
+ "learning_rate": 4.879099088427824e-06,
+ "loss": 0.0131,
+ "step": 133
+ },
+ {
+ "epoch": 1.8433179723502304,
+ "grad_norm": 0.23313525319099426,
+ "learning_rate": 4.871724993801541e-06,
+ "loss": 0.012,
+ "step": 134
+ },
+ {
+ "epoch": 1.8571428571428572,
+ "grad_norm": 0.2192607820034027,
+ "learning_rate": 4.864138533331411e-06,
+ "loss": 0.0125,
+ "step": 135
+ },
+ {
+ "epoch": 1.870967741935484,
+ "grad_norm": 0.26603585481643677,
+ "learning_rate": 4.8563403863141825e-06,
+ "loss": 0.0121,
+ "step": 136
+ },
+ {
+ "epoch": 1.8847926267281108,
+ "grad_norm": 0.32500001788139343,
+ "learning_rate": 4.84833125100116e-06,
+ "loss": 0.0116,
+ "step": 137
+ },
+ {
+ "epoch": 1.8986175115207373,
+ "grad_norm": 0.24893291294574738,
+ "learning_rate": 4.840111844535682e-06,
+ "loss": 0.0119,
+ "step": 138
+ },
+ {
+ "epoch": 1.912442396313364,
+ "grad_norm": 0.17670764029026031,
+ "learning_rate": 4.8316829028889076e-06,
+ "loss": 0.0096,
+ "step": 139
+ },
+ {
+ "epoch": 1.9262672811059907,
+ "grad_norm": 0.16747575998306274,
+ "learning_rate": 4.823045180793914e-06,
+ "loss": 0.0113,
+ "step": 140
+ },
+ {
+ "epoch": 1.9400921658986174,
+ "grad_norm": 0.19587458670139313,
+ "learning_rate": 4.8141994516781196e-06,
+ "loss": 0.0111,
+ "step": 141
+ },
+ {
+ "epoch": 1.9539170506912442,
+ "grad_norm": 0.237543985247612,
+ "learning_rate": 4.805146507594034e-06,
+ "loss": 0.0088,
+ "step": 142
+ },
+ {
+ "epoch": 1.967741935483871,
+ "grad_norm": 0.22710399329662323,
+ "learning_rate": 4.7958871591483305e-06,
+ "loss": 0.0085,
+ "step": 143
+ },
+ {
+ "epoch": 1.9815668202764978,
+ "grad_norm": 0.2946629822254181,
+ "learning_rate": 4.786422235429269e-06,
+ "loss": 0.0122,
+ "step": 144
+ },
+ {
+ "epoch": 1.9953917050691246,
+ "grad_norm": 0.2763853371143341,
+ "learning_rate": 4.776752583932455e-06,
+ "loss": 0.0118,
+ "step": 145
+ },
+ {
+ "epoch": 2.0,
+ "grad_norm": 0.2763853371143341,
+ "learning_rate": 4.766879070484957e-06,
+ "loss": 0.0078,
+ "step": 146
+ },
+ {
+ "epoch": 2.013824884792627,
+ "grad_norm": 0.2722196877002716,
+ "learning_rate": 4.756802579167781e-06,
+ "loss": 0.0076,
+ "step": 147
+ },
+ {
+ "epoch": 2.0276497695852536,
+ "grad_norm": 0.18556565046310425,
+ "learning_rate": 4.746524012236706e-06,
+ "loss": 0.0091,
+ "step": 148
+ },
+ {
+ "epoch": 2.0414746543778803,
+ "grad_norm": 0.24442361295223236,
+ "learning_rate": 4.736044290041496e-06,
+ "loss": 0.009,
+ "step": 149
+ },
+ {
+ "epoch": 2.055299539170507,
+ "grad_norm": 0.24207571148872375,
+ "learning_rate": 4.725364350943492e-06,
+ "loss": 0.0085,
+ "step": 150
+ },
+ {
+ "epoch": 2.0691244239631335,
+ "grad_norm": 0.18502290546894073,
+ "learning_rate": 4.714485151231593e-06,
+ "loss": 0.0059,
+ "step": 151
+ },
+ {
+ "epoch": 2.0829493087557602,
+ "grad_norm": 0.3010450303554535,
+ "learning_rate": 4.703407665036622e-06,
+ "loss": 0.0071,
+ "step": 152
+ },
+ {
+ "epoch": 2.096774193548387,
+ "grad_norm": 0.23272967338562012,
+ "learning_rate": 4.692132884244113e-06,
+ "loss": 0.0074,
+ "step": 153
+ },
+ {
+ "epoch": 2.110599078341014,
+ "grad_norm": 0.25476181507110596,
+ "learning_rate": 4.680661818405485e-06,
+ "loss": 0.0082,
+ "step": 154
+ },
+ {
+ "epoch": 2.1244239631336406,
+ "grad_norm": 0.24534538388252258,
+ "learning_rate": 4.668995494647653e-06,
+ "loss": 0.0065,
+ "step": 155
+ },
+ {
+ "epoch": 2.1382488479262673,
+ "grad_norm": 0.1642732173204422,
+ "learning_rate": 4.657134957581057e-06,
+ "loss": 0.0054,
+ "step": 156
+ },
+ {
+ "epoch": 2.152073732718894,
+ "grad_norm": 0.21100501716136932,
+ "learning_rate": 4.645081269206128e-06,
+ "loss": 0.0091,
+ "step": 157
+ },
+ {
+ "epoch": 2.165898617511521,
+ "grad_norm": 0.19043587148189545,
+ "learning_rate": 4.632835508818192e-06,
+ "loss": 0.0047,
+ "step": 158
+ },
+ {
+ "epoch": 2.1797235023041477,
+ "grad_norm": 0.1804375797510147,
+ "learning_rate": 4.620398772910833e-06,
+ "loss": 0.0068,
+ "step": 159
+ },
+ {
+ "epoch": 2.193548387096774,
+ "grad_norm": 0.6586657762527466,
+ "learning_rate": 4.607772175077712e-06,
+ "loss": 0.0049,
+ "step": 160
+ },
+ {
+ "epoch": 2.207373271889401,
+ "grad_norm": 0.18181656301021576,
+ "learning_rate": 4.59495684591285e-06,
+ "loss": 0.0071,
+ "step": 161
+ },
+ {
+ "epoch": 2.2211981566820276,
+ "grad_norm": 0.760053813457489,
+ "learning_rate": 4.581953932909403e-06,
+ "loss": 0.0065,
+ "step": 162
+ },
+ {
+ "epoch": 2.2350230414746544,
+ "grad_norm": 0.1935238242149353,
+ "learning_rate": 4.5687646003569055e-06,
+ "loss": 0.0066,
+ "step": 163
+ },
+ {
+ "epoch": 2.248847926267281,
+ "grad_norm": 0.3035024404525757,
+ "learning_rate": 4.555390029237026e-06,
+ "loss": 0.0046,
+ "step": 164
+ },
+ {
+ "epoch": 2.262672811059908,
+ "grad_norm": 0.16596420109272003,
+ "learning_rate": 4.541831417117815e-06,
+ "loss": 0.007,
+ "step": 165
+ },
+ {
+ "epoch": 2.2764976958525347,
+ "grad_norm": 0.2578873336315155,
+ "learning_rate": 4.528089978046481e-06,
+ "loss": 0.0048,
+ "step": 166
+ },
+ {
+ "epoch": 2.2903225806451615,
+ "grad_norm": 1.7751781940460205,
+ "learning_rate": 4.514166942440679e-06,
+ "loss": 0.0041,
+ "step": 167
+ },
+ {
+ "epoch": 2.3041474654377883,
+ "grad_norm": 0.37872445583343506,
+ "learning_rate": 4.5000635569783365e-06,
+ "loss": 0.0045,
+ "step": 168
+ },
+ {
+ "epoch": 2.3179723502304146,
+ "grad_norm": 0.22949594259262085,
+ "learning_rate": 4.4857810844860325e-06,
+ "loss": 0.0071,
+ "step": 169
+ },
+ {
+ "epoch": 2.3317972350230414,
+ "grad_norm": 0.34662699699401855,
+ "learning_rate": 4.471320803825915e-06,
+ "loss": 0.006,
+ "step": 170
+ },
+ {
+ "epoch": 2.345622119815668,
+ "grad_norm": 0.5892661213874817,
+ "learning_rate": 4.4566840097811956e-06,
+ "loss": 0.0055,
+ "step": 171
+ },
+ {
+ "epoch": 2.359447004608295,
+ "grad_norm": 0.18866907060146332,
+ "learning_rate": 4.4418720129402145e-06,
+ "loss": 0.0036,
+ "step": 172
+ },
+ {
+ "epoch": 2.3732718894009217,
+ "grad_norm": 0.1510942429304123,
+ "learning_rate": 4.426886139579083e-06,
+ "loss": 0.0065,
+ "step": 173
+ },
+ {
+ "epoch": 2.3870967741935485,
+ "grad_norm": 0.21291828155517578,
+ "learning_rate": 4.411727731542937e-06,
+ "loss": 0.004,
+ "step": 174
+ },
+ {
+ "epoch": 2.4009216589861753,
+ "grad_norm": 0.18649035692214966,
+ "learning_rate": 4.39639814612578e-06,
+ "loss": 0.0047,
+ "step": 175
+ },
+ {
+ "epoch": 2.4147465437788016,
+ "grad_norm": 0.19008278846740723,
+ "learning_rate": 4.3808987559489536e-06,
+ "loss": 0.0071,
+ "step": 176
+ },
+ {
+ "epoch": 2.4285714285714284,
+ "grad_norm": 0.26282456517219543,
+ "learning_rate": 4.365230948838232e-06,
+ "loss": 0.0044,
+ "step": 177
+ },
+ {
+ "epoch": 2.442396313364055,
+ "grad_norm": 0.2351403385400772,
+ "learning_rate": 4.349396127699552e-06,
+ "loss": 0.0057,
+ "step": 178
+ },
+ {
+ "epoch": 2.456221198156682,
+ "grad_norm": 0.20451441407203674,
+ "learning_rate": 4.3333957103934025e-06,
+ "loss": 0.003,
+ "step": 179
+ },
+ {
+ "epoch": 2.4700460829493087,
+ "grad_norm": 0.22120380401611328,
+ "learning_rate": 4.317231129607859e-06,
+ "loss": 0.0045,
+ "step": 180
+ },
+ {
+ "epoch": 2.4838709677419355,
+ "grad_norm": 0.18543967604637146,
+ "learning_rate": 4.30090383273031e-06,
+ "loss": 0.0062,
+ "step": 181
+ },
+ {
+ "epoch": 2.4976958525345623,
+ "grad_norm": 0.18473730981349945,
+ "learning_rate": 4.2844152817178476e-06,
+ "loss": 0.0052,
+ "step": 182
+ },
+ {
+ "epoch": 2.511520737327189,
+ "grad_norm": 0.21087361872196198,
+ "learning_rate": 4.267766952966369e-06,
+ "loss": 0.0041,
+ "step": 183
+ },
+ {
+ "epoch": 2.525345622119816,
+ "grad_norm": 0.24977360665798187,
+ "learning_rate": 4.2509603371783776e-06,
+ "loss": 0.0053,
+ "step": 184
+ },
+ {
+ "epoch": 2.539170506912442,
+ "grad_norm": 0.19377018511295319,
+ "learning_rate": 4.233996939229502e-06,
+ "loss": 0.0037,
+ "step": 185
+ },
+ {
+ "epoch": 2.5529953917050694,
+ "grad_norm": 0.21130548417568207,
+ "learning_rate": 4.216878278033753e-06,
+ "loss": 0.0031,
+ "step": 186
+ },
+ {
+ "epoch": 2.5668202764976957,
+ "grad_norm": 0.13288047909736633,
+ "learning_rate": 4.199605886407515e-06,
+ "loss": 0.0029,
+ "step": 187
+ },
+ {
+ "epoch": 2.5806451612903225,
+ "grad_norm": 0.15998876094818115,
+ "learning_rate": 4.1821813109322975e-06,
+ "loss": 0.0031,
+ "step": 188
+ },
+ {
+ "epoch": 2.5944700460829493,
+ "grad_norm": 0.19475506246089935,
+ "learning_rate": 4.164606111816256e-06,
+ "loss": 0.0022,
+ "step": 189
+ },
+ {
+ "epoch": 2.608294930875576,
+ "grad_norm": 0.1446300446987152,
+ "learning_rate": 4.146881862754485e-06,
+ "loss": 0.0025,
+ "step": 190
+ },
+ {
+ "epoch": 2.622119815668203,
+ "grad_norm": 0.13051164150238037,
+ "learning_rate": 4.129010150788112e-06,
+ "loss": 0.0019,
+ "step": 191
+ },
+ {
+ "epoch": 2.6359447004608296,
+ "grad_norm": 0.1953984946012497,
+ "learning_rate": 4.110992576162193e-06,
+ "loss": 0.0021,
+ "step": 192
+ },
+ {
+ "epoch": 2.6497695852534564,
+ "grad_norm": 0.23630598187446594,
+ "learning_rate": 4.092830752182423e-06,
+ "loss": 0.002,
+ "step": 193
+ },
+ {
+ "epoch": 2.6635944700460827,
+ "grad_norm": 0.2919062376022339,
+ "learning_rate": 4.074526305070679e-06,
+ "loss": 0.0017,
+ "step": 194
+ },
+ {
+ "epoch": 2.6774193548387095,
+ "grad_norm": 0.22015534341335297,
+ "learning_rate": 4.056080873819412e-06,
+ "loss": 0.0025,
+ "step": 195
+ },
+ {
+ "epoch": 2.6912442396313363,
+ "grad_norm": 0.9449160099029541,
+ "learning_rate": 4.037496110044885e-06,
+ "loss": 0.0024,
+ "step": 196
+ },
+ {
+ "epoch": 2.705069124423963,
+ "grad_norm": 0.25235581398010254,
+ "learning_rate": 4.018773677839289e-06,
+ "loss": 0.0031,
+ "step": 197
+ },
+ {
+ "epoch": 2.71889400921659,
+ "grad_norm": 0.3098089098930359,
+ "learning_rate": 3.999915253621739e-06,
+ "loss": 0.0019,
+ "step": 198
+ },
+ {
+ "epoch": 2.7327188940092166,
+ "grad_norm": 0.19896291196346283,
+ "learning_rate": 3.980922525988167e-06,
+ "loss": 0.0019,
+ "step": 199
+ },
+ {
+ "epoch": 2.7465437788018434,
+ "grad_norm": 0.21136268973350525,
+ "learning_rate": 3.961797195560118e-06,
+ "loss": 0.0031,
+ "step": 200
+ },
+ {
+ "epoch": 2.76036866359447,
+ "grad_norm": 0.2549005150794983,
+ "learning_rate": 3.942540974832486e-06,
+ "loss": 0.0017,
+ "step": 201
+ },
+ {
+ "epoch": 2.774193548387097,
+ "grad_norm": 0.14762410521507263,
+ "learning_rate": 3.9231555880201655e-06,
+ "loss": 0.0022,
+ "step": 202
+ },
+ {
+ "epoch": 2.7880184331797233,
+ "grad_norm": 0.16235944628715515,
+ "learning_rate": 3.903642770903671e-06,
+ "loss": 0.0012,
+ "step": 203
+ },
+ {
+ "epoch": 2.80184331797235,
+ "grad_norm": 0.1506718099117279,
+ "learning_rate": 3.884004270673711e-06,
+ "loss": 0.0015,
+ "step": 204
+ },
+ {
+ "epoch": 2.815668202764977,
+ "grad_norm": 0.10484135895967484,
+ "learning_rate": 3.864241845774746e-06,
+ "loss": 0.0016,
+ "step": 205
+ },
+ {
+ "epoch": 2.8294930875576036,
+ "grad_norm": 0.7636306285858154,
+ "learning_rate": 3.844357265747531e-06,
+ "loss": 0.0018,
+ "step": 206
+ },
+ {
+ "epoch": 2.8433179723502304,
+ "grad_norm": 0.2242082804441452,
+ "learning_rate": 3.8243523110706736e-06,
+ "loss": 0.0021,
+ "step": 207
+ },
+ {
+ "epoch": 2.857142857142857,
+ "grad_norm": 0.3264133334159851,
+ "learning_rate": 3.8042287730012117e-06,
+ "loss": 0.0021,
+ "step": 208
+ },
+ {
+ "epoch": 2.870967741935484,
+ "grad_norm": 0.12472204118967056,
+ "learning_rate": 3.7839884534142157e-06,
+ "loss": 0.0011,
+ "step": 209
+ },
+ {
+ "epoch": 2.8847926267281108,
+ "grad_norm": 0.07526414096355438,
+ "learning_rate": 3.7636331646414524e-06,
+ "loss": 0.0017,
+ "step": 210
+ },
+ {
+ "epoch": 2.8986175115207375,
+ "grad_norm": 0.16134843230247498,
+ "learning_rate": 3.7431647293091076e-06,
+ "loss": 0.0019,
+ "step": 211
+ },
+ {
+ "epoch": 2.912442396313364,
+ "grad_norm": 0.14789307117462158,
+ "learning_rate": 3.7225849801745835e-06,
+ "loss": 0.0012,
+ "step": 212
+ },
+ {
+ "epoch": 2.9262672811059907,
+ "grad_norm": 0.13681238889694214,
+ "learning_rate": 3.701895759962397e-06,
+ "loss": 0.0011,
+ "step": 213
+ },
+ {
+ "epoch": 2.9400921658986174,
+ "grad_norm": 0.10747735947370529,
+ "learning_rate": 3.6810989211991777e-06,
+ "loss": 0.0007,
+ "step": 214
+ },
+ {
+ "epoch": 2.953917050691244,
+ "grad_norm": 0.08121375739574432,
+ "learning_rate": 3.6601963260477923e-06,
+ "loss": 0.0005,
+ "step": 215
+ },
+ {
+ "epoch": 2.967741935483871,
+ "grad_norm": 0.0884300246834755,
+ "learning_rate": 3.6391898461406045e-06,
+ "loss": 0.0014,
+ "step": 216
+ }
+ ],
+ "logging_steps": 1,
+ "max_steps": 432,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 6,
+ "save_steps": 72,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 1.3809900078187414e+19,
+ "train_batch_size": 4,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/checkpoint-216/training_args.bin b/checkpoint-216/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..48c70fd554062e31c1333fa196fcbd6a4f178c6c
--- /dev/null
+++ b/checkpoint-216/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d1d334ff15891d240486e54641859af8de96cab43a64ef7bf9dc417387365ae5
+size 7928
diff --git a/checkpoint-216/zero_to_fp32.py b/checkpoint-216/zero_to_fp32.py
new file mode 100644
index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8
--- /dev/null
+++ b/checkpoint-216/zero_to_fp32.py
@@ -0,0 +1,604 @@
+#!/usr/bin/env python
+
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# This script extracts fp32 consolidated weights from ZeRO 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example: python zero_to_fp32.py . pytorch_model.bin
+
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+from collections import OrderedDict
+from dataclasses import dataclass
+
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+ FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+ FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+
+
@dataclass
class zero_model_state:
    """Per-rank pieces of a *_model_states.pt checkpoint needed to rebuild fp32 weights."""
    # NOTE(review): the original annotated several fields with `dict()` (an empty dict
    # *instance*); dataclasses never enforce annotations, so it worked, but the
    # conventional type `dict` is used here instead.
    buffers: dict  # buffer name -> fp32 tensor
    param_shapes: list  # list of {param name -> shape}, iterated per param group below
    shared_params: list  # [alias_name, source_name] pairs
    ds_version: str  # deepspeed version recorded in the checkpoint (may be None)
    frozen_param_shapes: dict  # may be None when there are no frozen params
    frozen_param_fragments: dict  # may be None when there are no frozen params
+
+
# verbose progress printing; rebound from the -d/--debug CLI flag in __main__
debug = 0

# load to cpu — the consolidation below happens entirely in host memory
device = torch.device('cpu')
+
+
def atoi(text):
    """Convert an all-digit chunk to int (so "10" sorts after "9"); pass others through."""
    if text.isdigit():
        return int(text)
    return text
+
+
def natural_keys(text):
    """Sort key for human ("natural") ordering, e.g. step2 < step10.

    Based on http://nedbatchelder.com/blog/200712/human_sorting.html
    (see Toothy's implementation in the comments there).
    """
    # The capturing group keeps the digit runs in the split result; atoi()
    # turns each digit run into an int so numeric chunks compare numerically.
    chunks = re.split(r'(\d+)', text)
    return [atoi(chunk) for chunk in chunks]
+
+
def get_model_state_file(checkpoint_dir, zero_stage):
    """Return the rank-0 model-states file path for the given ZeRO stage.

    Args:
        checkpoint_dir: tagged checkpoint directory containing the *.pt files
        zero_stage: ZeRO optimization stage (1, 2 or 3)

    Raises:
        FileNotFoundError: if the directory or the expected file is missing
        ValueError: if zero_stage is not a recognized stage
    """
    if not os.path.isdir(checkpoint_dir):
        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")

    # there should be only one file
    if zero_stage <= 2:
        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
    elif zero_stage == 3:
        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
    else:
        # previously an unexpected stage fell through and raised a confusing
        # NameError on `file`; fail explicitly instead
        raise ValueError(f"unknown zero stage {zero_stage}")

    if not os.path.exists(file):
        raise FileNotFoundError(f"can't find model states file at '{file}'")

    return file
+
+
def get_checkpoint_files(checkpoint_dir, glob_pattern):
    """Return files in checkpoint_dir matching glob_pattern, in natural (human) order.

    Raises FileNotFoundError when nothing matches.
    """
    # XXX: need to test that this simple glob rule works for multi-node setup too
    pattern = os.path.join(checkpoint_dir, glob_pattern)
    ckpt_files = sorted(glob.glob(pattern), key=natural_keys)

    if not ckpt_files:
        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")

    return ckpt_files
+
+
def get_optim_files(checkpoint_dir):
    # one "*_optim_states.pt" file per rank, naturally sorted by rank number
    return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
+
+
def get_model_state_files(checkpoint_dir):
    # one "*_model_states.pt" file per rank, naturally sorted by rank number
    return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
+
+
def parse_model_states(files):
    """Load each *_model_states.pt file and extract the pieces needed to rebuild
    fp32 weights (buffers, param shapes, shared/frozen params).

    Returns a list of `zero_model_state`, one entry per rank file, in rank order.
    Raises ValueError if a file is not a model-states checkpoint.
    """
    zero_model_states = []
    for file in files:
        state_dict = torch.load(file, map_location=device)

        if BUFFER_NAMES not in state_dict:
            raise ValueError(f"{file} is not a model state checkpoint")
        buffer_names = state_dict[BUFFER_NAMES]
        if debug:
            print("Found buffers:", buffer_names)

        # recover just the buffers while restoring them to fp32 if they were saved in fp16
        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
        param_shapes = state_dict[PARAM_SHAPES]

        # collect parameters that are included in param_shapes
        # NOTE(review): param_names is built up but never used below — dead code
        # kept for fidelity with the original script.
        param_names = []
        for s in param_shapes:
            for name in s.keys():
                param_names.append(name)

        # update with frozen parameters
        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
        if frozen_param_shapes is not None:
            if debug:
                print(f"Found frozen_param_shapes: {frozen_param_shapes}")
            param_names += list(frozen_param_shapes.keys())

        # handle shared params
        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]

        ds_version = state_dict.get(DS_VERSION, None)

        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)

        z_model_state = zero_model_state(buffers=buffers,
                                         param_shapes=param_shapes,
                                         shared_params=shared_params,
                                         ds_version=ds_version,
                                         frozen_param_shapes=frozen_param_shapes,
                                         frozen_param_fragments=frozen_param_fragments)
        zero_model_states.append(z_model_state)

    return zero_model_states
+
+
def parse_optim_states(files, ds_checkpoint_dir):
    """Load all *_optim_states.pt files and return (zero_stage, world_size, fp32_flat_groups).

    fp32_flat_groups holds each rank's flattened fp32 master-weight partition(s);
    for ZeRO-3 the per-param-group tensors of each rank are concatenated into one.
    Raises ValueError for non-ZeRO checkpoints, rank-count mismatches, or an
    unknown stage.
    """
    total_files = len(files)
    state_dicts = []
    for f in files:
        state_dict = torch.load(f, map_location=device)
        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
        # and also handle the case where it was already removed by another helper script
        state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
        state_dicts.append(state_dict)

    if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]:
        raise ValueError(f"{files[0]} is not a zero checkpoint")
    zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
    world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]

    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
    # parameters can be different from data parallelism for non-expert parameters. So we can just
    # use the max of the partition_count to get the dp world_size.

    if type(world_size) is list:
        world_size = max(world_size)

    if world_size != total_files:
        raise ValueError(
            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
        )

    # the groups are named differently in each stage
    if zero_stage <= 2:
        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
    elif zero_stage == 3:
        fp32_groups_key = FP32_FLAT_GROUPS
    else:
        raise ValueError(f"unknown zero stage {zero_stage}")

    if zero_stage <= 2:
        fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
    elif zero_stage == 3:
        # if there is more than one param group, there will be multiple flattened tensors - one
        # flattened tensor per group - for simplicity merge them into a single tensor
        #
        # XXX: could make the script more memory efficient for when there are multiple groups - it
        # will require matching the sub-lists of param_shapes for each param group flattened tensor

        fp32_flat_groups = [
            torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts))
        ]

    return zero_stage, world_size, fp32_flat_groups
+
+
def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
    """
    Returns fp32 state_dict reconstructed from ds checkpoint

    Args:
        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
        - ``exclude_frozen_parameters``: when True, frozen (untrained) parameters are left out

    """
    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")

    optim_files = get_optim_files(ds_checkpoint_dir)
    zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")

    model_files = get_model_state_files(ds_checkpoint_dir)

    zero_model_states = parse_model_states(model_files)
    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')

    # dispatch on the detected stage; parse_optim_states already rejected unknown stages
    if zero_stage <= 2:
        return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
                                                          exclude_frozen_parameters)
    elif zero_stage == 3:
        return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
                                                          exclude_frozen_parameters)
+
+
def _zero2_merge_frozen_params(state_dict, zero_model_states):
    """Copy frozen (untrained) parameters into state_dict for ZeRO-1/2.

    Rank 0's frozen fragments are used directly — no cross-rank merging is
    performed here. No-op when the checkpoint has no frozen params.
    """
    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
        return

    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
    frozen_param_fragments = zero_model_states[0].frozen_param_fragments

    if debug:
        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')

    wanted_params = len(frozen_param_shapes)
    wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
    avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
    print(f'Frozen params: Have {avail_numel} numels to process.')
    print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')

    total_params = 0
    total_numel = 0
    for name, shape in frozen_param_shapes.items():
        total_params += 1
        unpartitioned_numel = shape.numel()
        total_numel += unpartitioned_numel

        state_dict[name] = frozen_param_fragments[name]

        if debug:
            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")

    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _has_callable(obj, fn):
+ attr = getattr(obj, fn, None)
+ return callable(attr)
+
+
def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
    """Rebuild each trainable parameter for ZeRO-1/2.

    Concatenates the per-rank fp32 flat partitions of each param group into one
    vector, then slices parameters out of it by offset and stores them in
    state_dict under their original names.
    """
    param_shapes = zero_model_states[0].param_shapes

    # Reconstruction protocol:
    #
    # XXX: document this

    if debug:
        for i in range(world_size):
            for j in range(len(fp32_flat_groups[0])):
                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")

    # XXX: memory usage doubles here (zero2)
    num_param_groups = len(fp32_flat_groups[0])
    merged_single_partition_of_fp32_groups = []
    for i in range(num_param_groups):
        merged_partitions = [sd[i] for sd in fp32_flat_groups]
        full_single_fp32_vector = torch.cat(merged_partitions, 0)
        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
    avail_numel = sum(
        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])

    if debug:
        wanted_params = sum([len(shapes) for shapes in param_shapes])
        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
        # not asserting if there is a mismatch due to possible padding
        print(f"Have {avail_numel} numels to process.")
        print(f"Need {wanted_numel} numels in {wanted_params} params.")

    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    total_numel = 0
    total_params = 0
    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
        offset = 0
        avail_numel = full_single_fp32_vector.numel()
        for name, shape in shapes.items():

            # shapes may be torch.Size (has .numel) or plain tuples/lists
            unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
            total_numel += unpartitioned_numel
            total_params += 1

            if debug:
                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
            offset += unpartitioned_numel

        # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
        # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
        # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
        # live optimizer object, so we are checking that the numbers are within the right range
        align_to = 2 * world_size

        def zero2_align(x):
            return align_to * math.ceil(x / align_to)

        if debug:
            print(f"original offset={offset}, avail_numel={avail_numel}")

        offset = zero2_align(offset)
        avail_numel = zero2_align(avail_numel)

        if debug:
            print(f"aligned offset={offset}, avail_numel={avail_numel}")

        # Sanity check
        if offset != avail_numel:
            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")

    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+
+
def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
                                               exclude_frozen_parameters):
    """Assemble the full fp32 state_dict for a ZeRO-1/2 checkpoint:
    buffers + (optionally) frozen params + merged trainable params + shared params.
    """
    state_dict = OrderedDict()

    # buffers
    buffers = zero_model_states[0].buffers
    state_dict.update(buffers)
    if debug:
        print(f"added {len(buffers)} buffers")

    if not exclude_frozen_parameters:
        _zero2_merge_frozen_params(state_dict, zero_model_states)

    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)

    # recover shared parameters: each [alias, source] pair aliases the source tensor
    for pair in zero_model_states[0].shared_params:
        if pair[1] in state_dict:
            state_dict[pair[0]] = state_dict[pair[1]]

    return state_dict
+
+
def zero3_partitioned_param_info(unpartitioned_numel, world_size):
    """Return (per-rank partition size, padding elements) for a ZeRO-3 param.

    Each rank holds ceil(numel / world_size) elements; the last rank's slice is
    padded by `padding_numel` when numel is not evenly divisible.
    """
    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
    leftover = unpartitioned_numel % world_size
    padding_numel = (world_size - leftover) if leftover else 0
    return partitioned_numel, padding_numel
+
+
def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
    """Reassemble frozen (untrained) parameters partitioned across ranks (ZeRO-3):
    concatenate each rank's fragment, trim alignment padding, reshape.

    No-op when the checkpoint has no frozen params.
    """
    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
        return

    if debug:
        for i in range(world_size):
            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')

    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
    wanted_params = len(frozen_param_shapes)
    wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
    avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
    print(f'Frozen params: Have {avail_numel} numels to process.')
    print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')

    total_params = 0
    total_numel = 0
    for name, shape in zero_model_states[0].frozen_param_shapes.items():
        total_params += 1
        unpartitioned_numel = shape.numel()
        total_numel += unpartitioned_numel

        # concatenate every rank's fragment, then narrow() drops the tail padding
        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
        state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)

        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)

        if debug:
            print(
                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
            )

    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
    """Rebuild each trainable parameter for ZeRO-3.

    Walks a shared offset through every rank's flat tensor, concatenates each
    rank's slice for a parameter, trims padding, reshapes, and stores it in
    state_dict under the original name.
    """
    param_shapes = zero_model_states[0].param_shapes
    avail_numel = fp32_flat_groups[0].numel() * world_size
    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
    # param, re-consolidating each param, while dealing with padding if any

    # merge list of dicts, preserving order
    param_shapes = {k: v for d in param_shapes for k, v in d.items()}

    if debug:
        for i in range(world_size):
            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")

    wanted_params = len(param_shapes)
    wanted_numel = sum(shape.numel() for shape in param_shapes.values())
    # not asserting if there is a mismatch due to possible padding
    # NOTE(review): this recomputes the same avail_numel as above — redundant but harmless
    avail_numel = fp32_flat_groups[0].numel() * world_size
    print(f"Trainable params: Have {avail_numel} numels to process.")
    print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")

    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    offset = 0
    total_numel = 0
    total_params = 0
    for name, shape in param_shapes.items():

        unpartitioned_numel = shape.numel()
        total_numel += unpartitioned_numel
        total_params += 1

        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)

        if debug:
            print(
                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
            )

        # XXX: memory usage doubles here
        state_dict[name] = torch.cat(
            tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)),
            0).narrow(0, 0, unpartitioned_numel).view(shape)
        offset += partitioned_numel

    offset *= world_size

    # Sanity check
    if offset != avail_numel:
        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")

    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+
+
def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
                                               exclude_frozen_parameters):
    """Assemble the full fp32 state_dict for a ZeRO-3 checkpoint:
    buffers + (optionally) frozen params + merged trainable params + shared params.
    """
    state_dict = OrderedDict()

    # buffers
    buffers = zero_model_states[0].buffers
    state_dict.update(buffers)
    if debug:
        print(f"added {len(buffers)} buffers")

    if not exclude_frozen_parameters:
        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)

    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)

    # recover shared parameters: each [alias, source] pair aliases the source tensor
    for pair in zero_model_states[0].shared_params:
        if pair[1] in state_dict:
            state_dict[pair[0]] = state_dict[pair[1]]

    return state_dict
+
+
def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False):
    """
    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
    via a model hub.

    Args:
        - ``checkpoint_dir``: path to the desired checkpoint folder
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
        - ``exclude_frozen_parameters``: exclude frozen parameters

    Returns:
        - pytorch ``state_dict``

    Note: this approach may not work if your application doesn't have sufficient free CPU memory and
    you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
    the checkpoint.

    A typical usage might be ::

        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
        # do the training and checkpoint saving
        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
        model = model.cpu() # move to cpu
        model.load_state_dict(state_dict)
        # submit to model hub or save the model to share with others

    In this example the ``model`` will no longer be usable in the deepspeed context of the same
    application. i.e. you will need to re-initialize the deepspeed engine, since
    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.

    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.

    """
    if tag is None:
        # resolve the tag from the 'latest' file saved alongside the tagged checkpoint folders
        latest_path = os.path.join(checkpoint_dir, 'latest')
        if os.path.isfile(latest_path):
            with open(latest_path, 'r') as fd:
                tag = fd.read().strip()
        else:
            raise ValueError(f"Unable to find 'latest' file at {latest_path}")

    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)

    if not os.path.isdir(ds_checkpoint_dir):
        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")

    return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+
+
def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False):
    """
    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
    loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.

    Args:
        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
        - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
        - ``exclude_frozen_parameters``: exclude frozen parameters
    """

    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters)
    print(f"Saving fp32 state dict to {output_file}")
    # tensors were reconstructed on CPU (see module-level `device`), so the
    # resulting file can be loaded without GPUs
    torch.save(state_dict, output_file)
+
+
def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
    """
    1. Put the provided model to cpu
    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
    3. Load it into the provided model

    Args:
        - ``model``: the model object to update
        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``

    Returns:
        - ``model``: modified model

    Make sure you have plenty of CPU memory available before you call this function. If you don't
    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
    conveniently placed for you in the checkpoint folder.

    A typical usage might be ::

        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
        # submit to model hub or save the model to share with others

    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.

    """
    logger.info(f"Extracting fp32 weights")
    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)

    logger.info(f"Overwriting model with fp32 weights")
    model = model.cpu()
    # strict=False: do not error on keys missing from / unexpected in the recovered state_dict
    model.load_state_dict(state_dict, strict=False)

    return model
+
+
if __name__ == "__main__":

    # CLI entry point: consolidate a ZeRO checkpoint into a single fp32 state_dict file.
    parser = argparse.ArgumentParser()
    parser.add_argument("checkpoint_dir",
                        type=str,
                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
    parser.add_argument(
        "output_file",
        type=str,
        help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)")
    parser.add_argument("-t",
                        "--tag",
                        type=str,
                        default=None,
                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
    args = parser.parse_args()

    # rebind the module-level `debug` flag used by the helpers above
    debug = args.debug

    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
                                               args.output_file,
                                               tag=args.tag,
                                               exclude_frozen_parameters=args.exclude_frozen_parameters)
diff --git a/checkpoint-288/README.md b/checkpoint-288/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..fdf619c317c2fe82074662582dbd68166b6f9d50
--- /dev/null
+++ b/checkpoint-288/README.md
@@ -0,0 +1,202 @@
+---
+base_model: nvidia/Llama-3.1-Nemotron-70B-Instruct-HF
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.15.0
\ No newline at end of file
diff --git a/checkpoint-288/adapter_config.json b/checkpoint-288/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..3abb5d68d20446d2b99ace226d6233a68590205a
--- /dev/null
+++ b/checkpoint-288/adapter_config.json
@@ -0,0 +1,42 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF",
+ "bias": "none",
+ "corda_config": null,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": null,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 512,
+ "lora_bias": false,
+ "lora_dropout": 0.05,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": [
+ "embed_tokens",
+ "lm_head"
+ ],
+ "peft_type": "LORA",
+ "r": 256,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "o_proj",
+ "up_proj",
+ "down_proj",
+ "gate_proj",
+ "v_proj",
+ "k_proj",
+ "q_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-288/adapter_model.safetensors b/checkpoint-288/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..e2e6b8ced2269a0d6d4056a5a8da46fa5df8af18
--- /dev/null
+++ b/checkpoint-288/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:17c10615ae338430e75fa3f2078230db3d94b49d56be711c82f521d2044d269b
+size 10829849744
diff --git a/checkpoint-288/global_step286/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-288/global_step286/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..2ee4d7dffb1bb201450a97fce4c98fad48a5e6e2
--- /dev/null
+++ b/checkpoint-288/global_step286/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4614f1ef93917deb902779b7834543605a12b1dd67246dc1ba65553feb0e8ecf
+size 21659418140
diff --git a/checkpoint-288/global_step286/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-288/global_step286/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..079ee5ba8a8e717643340a05c9495557af1be0fc
--- /dev/null
+++ b/checkpoint-288/global_step286/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:146378ec6de3b728a0ee339a3b0fe1d2e7f83a0117b3eb377b10dd103386b513
+size 21659457372
diff --git a/checkpoint-288/global_step286/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-288/global_step286/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..a4ea32f67afa2900e459933f81d0a2ce389a871f
--- /dev/null
+++ b/checkpoint-288/global_step286/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6933ae1ba86c3ceb587a0f849b09ab02f12706ff86ffff95f1ea94d24b411fd3
+size 21659417820
diff --git a/checkpoint-288/global_step286/mp_rank_00_model_states.pt b/checkpoint-288/global_step286/mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..378aa2c9893febbf113a69a9eedfc60a215c8506
--- /dev/null
+++ b/checkpoint-288/global_step286/mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:73de910886cfbfa8a043b447b500fc39a4097fd6562d41db7308fd68cbf6c082
+size 11918643933
diff --git a/checkpoint-288/latest b/checkpoint-288/latest
new file mode 100644
index 0000000000000000000000000000000000000000..d39b7b89e4c7ece066f462dc46df67da65d1810e
--- /dev/null
+++ b/checkpoint-288/latest
@@ -0,0 +1 @@
+global_step286
\ No newline at end of file
diff --git a/checkpoint-288/rng_state_0.pth b/checkpoint-288/rng_state_0.pth
new file mode 100644
index 0000000000000000000000000000000000000000..51096e4dafd3a8bfeb6752f6ae421bb3e6fb942b
--- /dev/null
+++ b/checkpoint-288/rng_state_0.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7c97e3335cd6fbb20b76a202cb002eb217c1982b611bc1a714282da4176c8f5c
+size 14768
diff --git a/checkpoint-288/rng_state_1.pth b/checkpoint-288/rng_state_1.pth
new file mode 100644
index 0000000000000000000000000000000000000000..44511aabee726de376a223133282d6b368dbef19
--- /dev/null
+++ b/checkpoint-288/rng_state_1.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c91130d325023d767cdd46255e1fd9b3e83d624f256c07f9c9c131849abfdec3
+size 14768
diff --git a/checkpoint-288/rng_state_2.pth b/checkpoint-288/rng_state_2.pth
new file mode 100644
index 0000000000000000000000000000000000000000..5dbeee29f2f0eca8b7f1789b62e96bd1de8e1772
--- /dev/null
+++ b/checkpoint-288/rng_state_2.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b246bd578c3e3ae237e25a547bf3d85d3c85f96c12dfade914140855e2d1bec0
+size 14768
diff --git a/checkpoint-288/scheduler.pt b/checkpoint-288/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..7628c0024597416a8211ffbd5a68418d83a678eb
--- /dev/null
+++ b/checkpoint-288/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b69b4d3a079847fc3286aa16b458c66149593b0a314e964b178b2d2904a96b7b
+size 1064
diff --git a/checkpoint-288/special_tokens_map.json b/checkpoint-288/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..278b7f0f84be865c4687700ee7b3c63d89a51e18
--- /dev/null
+++ b/checkpoint-288/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+ "bos_token": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "<|eot_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/checkpoint-288/tokenizer.json b/checkpoint-288/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2
--- /dev/null
+++ b/checkpoint-288/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
+size 17209920
diff --git a/checkpoint-288/tokenizer_config.json b/checkpoint-288/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..fdde94c29816839ec3c29d6c6461206a49911f3c
--- /dev/null
+++ b/checkpoint-288/tokenizer_config.json
@@ -0,0 +1,2064 @@
+{
+ "added_tokens_decoder": {
+ "128000": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128001": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128002": {
+ "content": "<|reserved_special_token_0|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128003": {
+ "content": "<|reserved_special_token_1|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128004": {
+ "content": "<|finetune_right_pad_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128005": {
+ "content": "<|reserved_special_token_2|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128006": {
+ "content": "<|start_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128007": {
+ "content": "<|end_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128008": {
+ "content": "<|eom_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128009": {
+ "content": "<|eot_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128010": {
+ "content": "<|python_tag|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128011": {
+ "content": "<|reserved_special_token_3|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128012": {
+ "content": "<|reserved_special_token_4|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128013": {
+ "content": "<|reserved_special_token_5|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128014": {
+ "content": "<|reserved_special_token_6|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128015": {
+ "content": "<|reserved_special_token_7|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128016": {
+ "content": "<|reserved_special_token_8|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128017": {
+ "content": "<|reserved_special_token_9|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128018": {
+ "content": "<|reserved_special_token_10|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128019": {
+ "content": "<|reserved_special_token_11|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128020": {
+ "content": "<|reserved_special_token_12|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128021": {
+ "content": "<|reserved_special_token_13|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128022": {
+ "content": "<|reserved_special_token_14|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128023": {
+ "content": "<|reserved_special_token_15|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128024": {
+ "content": "<|reserved_special_token_16|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128025": {
+ "content": "<|reserved_special_token_17|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128026": {
+ "content": "<|reserved_special_token_18|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128027": {
+ "content": "<|reserved_special_token_19|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128028": {
+ "content": "<|reserved_special_token_20|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128029": {
+ "content": "<|reserved_special_token_21|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128030": {
+ "content": "<|reserved_special_token_22|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128031": {
+ "content": "<|reserved_special_token_23|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128032": {
+ "content": "<|reserved_special_token_24|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128033": {
+ "content": "<|reserved_special_token_25|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128034": {
+ "content": "<|reserved_special_token_26|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128035": {
+ "content": "<|reserved_special_token_27|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128036": {
+ "content": "<|reserved_special_token_28|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128037": {
+ "content": "<|reserved_special_token_29|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128038": {
+ "content": "<|reserved_special_token_30|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128039": {
+ "content": "<|reserved_special_token_31|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128040": {
+ "content": "<|reserved_special_token_32|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128041": {
+ "content": "<|reserved_special_token_33|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128042": {
+ "content": "<|reserved_special_token_34|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128043": {
+ "content": "<|reserved_special_token_35|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128044": {
+ "content": "<|reserved_special_token_36|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128045": {
+ "content": "<|reserved_special_token_37|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128046": {
+ "content": "<|reserved_special_token_38|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128047": {
+ "content": "<|reserved_special_token_39|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128048": {
+ "content": "<|reserved_special_token_40|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128049": {
+ "content": "<|reserved_special_token_41|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128050": {
+ "content": "<|reserved_special_token_42|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128051": {
+ "content": "<|reserved_special_token_43|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128052": {
+ "content": "<|reserved_special_token_44|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128053": {
+ "content": "<|reserved_special_token_45|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128054": {
+ "content": "<|reserved_special_token_46|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128055": {
+ "content": "<|reserved_special_token_47|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128056": {
+ "content": "<|reserved_special_token_48|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128057": {
+ "content": "<|reserved_special_token_49|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128058": {
+ "content": "<|reserved_special_token_50|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128059": {
+ "content": "<|reserved_special_token_51|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128060": {
+ "content": "<|reserved_special_token_52|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128061": {
+ "content": "<|reserved_special_token_53|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128062": {
+ "content": "<|reserved_special_token_54|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128063": {
+ "content": "<|reserved_special_token_55|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128064": {
+ "content": "<|reserved_special_token_56|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128065": {
+ "content": "<|reserved_special_token_57|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128066": {
+ "content": "<|reserved_special_token_58|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128067": {
+ "content": "<|reserved_special_token_59|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128068": {
+ "content": "<|reserved_special_token_60|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128069": {
+ "content": "<|reserved_special_token_61|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128070": {
+ "content": "<|reserved_special_token_62|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128071": {
+ "content": "<|reserved_special_token_63|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128072": {
+ "content": "<|reserved_special_token_64|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128073": {
+ "content": "<|reserved_special_token_65|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128074": {
+ "content": "<|reserved_special_token_66|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128075": {
+ "content": "<|reserved_special_token_67|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128076": {
+ "content": "<|reserved_special_token_68|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128077": {
+ "content": "<|reserved_special_token_69|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128078": {
+ "content": "<|reserved_special_token_70|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128079": {
+ "content": "<|reserved_special_token_71|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128080": {
+ "content": "<|reserved_special_token_72|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128081": {
+ "content": "<|reserved_special_token_73|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128082": {
+ "content": "<|reserved_special_token_74|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128083": {
+ "content": "<|reserved_special_token_75|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128084": {
+ "content": "<|reserved_special_token_76|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128085": {
+ "content": "<|reserved_special_token_77|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128086": {
+ "content": "<|reserved_special_token_78|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128087": {
+ "content": "<|reserved_special_token_79|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128088": {
+ "content": "<|reserved_special_token_80|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128089": {
+ "content": "<|reserved_special_token_81|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128090": {
+ "content": "<|reserved_special_token_82|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128091": {
+ "content": "<|reserved_special_token_83|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128092": {
+ "content": "<|reserved_special_token_84|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128093": {
+ "content": "<|reserved_special_token_85|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128094": {
+ "content": "<|reserved_special_token_86|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128095": {
+ "content": "<|reserved_special_token_87|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128096": {
+ "content": "<|reserved_special_token_88|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128097": {
+ "content": "<|reserved_special_token_89|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128098": {
+ "content": "<|reserved_special_token_90|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128099": {
+ "content": "<|reserved_special_token_91|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128100": {
+ "content": "<|reserved_special_token_92|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128101": {
+ "content": "<|reserved_special_token_93|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128102": {
+ "content": "<|reserved_special_token_94|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128103": {
+ "content": "<|reserved_special_token_95|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128104": {
+ "content": "<|reserved_special_token_96|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128105": {
+ "content": "<|reserved_special_token_97|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128106": {
+ "content": "<|reserved_special_token_98|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128107": {
+ "content": "<|reserved_special_token_99|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128108": {
+ "content": "<|reserved_special_token_100|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128109": {
+ "content": "<|reserved_special_token_101|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128110": {
+ "content": "<|reserved_special_token_102|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128111": {
+ "content": "<|reserved_special_token_103|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128112": {
+ "content": "<|reserved_special_token_104|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128113": {
+ "content": "<|reserved_special_token_105|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128114": {
+ "content": "<|reserved_special_token_106|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128115": {
+ "content": "<|reserved_special_token_107|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128116": {
+ "content": "<|reserved_special_token_108|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128117": {
+ "content": "<|reserved_special_token_109|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128118": {
+ "content": "<|reserved_special_token_110|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128119": {
+ "content": "<|reserved_special_token_111|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128120": {
+ "content": "<|reserved_special_token_112|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128121": {
+ "content": "<|reserved_special_token_113|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128122": {
+ "content": "<|reserved_special_token_114|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128123": {
+ "content": "<|reserved_special_token_115|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128124": {
+ "content": "<|reserved_special_token_116|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128125": {
+ "content": "<|reserved_special_token_117|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128126": {
+ "content": "<|reserved_special_token_118|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128127": {
+ "content": "<|reserved_special_token_119|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128128": {
+ "content": "<|reserved_special_token_120|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128129": {
+ "content": "<|reserved_special_token_121|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128130": {
+ "content": "<|reserved_special_token_122|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128131": {
+ "content": "<|reserved_special_token_123|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128132": {
+ "content": "<|reserved_special_token_124|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128133": {
+ "content": "<|reserved_special_token_125|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128134": {
+ "content": "<|reserved_special_token_126|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128135": {
+ "content": "<|reserved_special_token_127|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128136": {
+ "content": "<|reserved_special_token_128|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128137": {
+ "content": "<|reserved_special_token_129|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128138": {
+ "content": "<|reserved_special_token_130|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128139": {
+ "content": "<|reserved_special_token_131|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128140": {
+ "content": "<|reserved_special_token_132|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128141": {
+ "content": "<|reserved_special_token_133|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128142": {
+ "content": "<|reserved_special_token_134|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128143": {
+ "content": "<|reserved_special_token_135|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128144": {
+ "content": "<|reserved_special_token_136|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128145": {
+ "content": "<|reserved_special_token_137|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128146": {
+ "content": "<|reserved_special_token_138|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128147": {
+ "content": "<|reserved_special_token_139|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128148": {
+ "content": "<|reserved_special_token_140|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128149": {
+ "content": "<|reserved_special_token_141|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128150": {
+ "content": "<|reserved_special_token_142|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128151": {
+ "content": "<|reserved_special_token_143|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128152": {
+ "content": "<|reserved_special_token_144|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128153": {
+ "content": "<|reserved_special_token_145|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128154": {
+ "content": "<|reserved_special_token_146|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128155": {
+ "content": "<|reserved_special_token_147|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128156": {
+ "content": "<|reserved_special_token_148|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128157": {
+ "content": "<|reserved_special_token_149|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128158": {
+ "content": "<|reserved_special_token_150|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128159": {
+ "content": "<|reserved_special_token_151|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128160": {
+ "content": "<|reserved_special_token_152|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128161": {
+ "content": "<|reserved_special_token_153|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128162": {
+ "content": "<|reserved_special_token_154|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128163": {
+ "content": "<|reserved_special_token_155|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128164": {
+ "content": "<|reserved_special_token_156|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128165": {
+ "content": "<|reserved_special_token_157|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128166": {
+ "content": "<|reserved_special_token_158|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128167": {
+ "content": "<|reserved_special_token_159|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128168": {
+ "content": "<|reserved_special_token_160|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128169": {
+ "content": "<|reserved_special_token_161|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128170": {
+ "content": "<|reserved_special_token_162|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128171": {
+ "content": "<|reserved_special_token_163|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128172": {
+ "content": "<|reserved_special_token_164|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128173": {
+ "content": "<|reserved_special_token_165|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128174": {
+ "content": "<|reserved_special_token_166|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128175": {
+ "content": "<|reserved_special_token_167|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128176": {
+ "content": "<|reserved_special_token_168|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128177": {
+ "content": "<|reserved_special_token_169|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128178": {
+ "content": "<|reserved_special_token_170|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128179": {
+ "content": "<|reserved_special_token_171|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128180": {
+ "content": "<|reserved_special_token_172|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128181": {
+ "content": "<|reserved_special_token_173|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128182": {
+ "content": "<|reserved_special_token_174|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128183": {
+ "content": "<|reserved_special_token_175|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128184": {
+ "content": "<|reserved_special_token_176|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128185": {
+ "content": "<|reserved_special_token_177|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128186": {
+ "content": "<|reserved_special_token_178|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128187": {
+ "content": "<|reserved_special_token_179|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128188": {
+ "content": "<|reserved_special_token_180|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128189": {
+ "content": "<|reserved_special_token_181|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128190": {
+ "content": "<|reserved_special_token_182|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128191": {
+ "content": "<|reserved_special_token_183|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128192": {
+ "content": "<|reserved_special_token_184|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128193": {
+ "content": "<|reserved_special_token_185|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128194": {
+ "content": "<|reserved_special_token_186|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128195": {
+ "content": "<|reserved_special_token_187|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128196": {
+ "content": "<|reserved_special_token_188|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128197": {
+ "content": "<|reserved_special_token_189|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128198": {
+ "content": "<|reserved_special_token_190|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128199": {
+ "content": "<|reserved_special_token_191|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128200": {
+ "content": "<|reserved_special_token_192|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128201": {
+ "content": "<|reserved_special_token_193|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128202": {
+ "content": "<|reserved_special_token_194|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128203": {
+ "content": "<|reserved_special_token_195|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128204": {
+ "content": "<|reserved_special_token_196|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128205": {
+ "content": "<|reserved_special_token_197|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128206": {
+ "content": "<|reserved_special_token_198|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128207": {
+ "content": "<|reserved_special_token_199|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128208": {
+ "content": "<|reserved_special_token_200|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128209": {
+ "content": "<|reserved_special_token_201|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128210": {
+ "content": "<|reserved_special_token_202|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128211": {
+ "content": "<|reserved_special_token_203|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128212": {
+ "content": "<|reserved_special_token_204|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128213": {
+ "content": "<|reserved_special_token_205|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128214": {
+ "content": "<|reserved_special_token_206|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128215": {
+ "content": "<|reserved_special_token_207|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128216": {
+ "content": "<|reserved_special_token_208|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128217": {
+ "content": "<|reserved_special_token_209|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128218": {
+ "content": "<|reserved_special_token_210|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128219": {
+ "content": "<|reserved_special_token_211|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128220": {
+ "content": "<|reserved_special_token_212|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128221": {
+ "content": "<|reserved_special_token_213|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128222": {
+ "content": "<|reserved_special_token_214|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128223": {
+ "content": "<|reserved_special_token_215|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128224": {
+ "content": "<|reserved_special_token_216|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128225": {
+ "content": "<|reserved_special_token_217|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128226": {
+ "content": "<|reserved_special_token_218|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128227": {
+ "content": "<|reserved_special_token_219|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128228": {
+ "content": "<|reserved_special_token_220|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128229": {
+ "content": "<|reserved_special_token_221|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128230": {
+ "content": "<|reserved_special_token_222|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128231": {
+ "content": "<|reserved_special_token_223|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128232": {
+ "content": "<|reserved_special_token_224|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128233": {
+ "content": "<|reserved_special_token_225|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128234": {
+ "content": "<|reserved_special_token_226|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128235": {
+ "content": "<|reserved_special_token_227|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128236": {
+ "content": "<|reserved_special_token_228|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128237": {
+ "content": "<|reserved_special_token_229|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128238": {
+ "content": "<|reserved_special_token_230|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128239": {
+ "content": "<|reserved_special_token_231|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128240": {
+ "content": "<|reserved_special_token_232|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128241": {
+ "content": "<|reserved_special_token_233|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128242": {
+ "content": "<|reserved_special_token_234|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128243": {
+ "content": "<|reserved_special_token_235|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128244": {
+ "content": "<|reserved_special_token_236|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128245": {
+ "content": "<|reserved_special_token_237|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128246": {
+ "content": "<|reserved_special_token_238|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128247": {
+ "content": "<|reserved_special_token_239|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128248": {
+ "content": "<|reserved_special_token_240|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128249": {
+ "content": "<|reserved_special_token_241|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128250": {
+ "content": "<|reserved_special_token_242|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128251": {
+ "content": "<|reserved_special_token_243|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128252": {
+ "content": "<|reserved_special_token_244|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128253": {
+ "content": "<|reserved_special_token_245|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128254": {
+ "content": "<|reserved_special_token_246|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128255": {
+ "content": "<|reserved_special_token_247|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<|begin_of_text|>",
+ "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n",
+ "clean_up_tokenization_spaces": true,
+ "eos_token": "<|eot_id|>",
+ "extra_special_tokens": {},
+ "model_input_names": [
+ "input_ids",
+ "attention_mask"
+ ],
+ "model_max_length": 131072,
+ "pad_token": "<|end_of_text|>",
+ "tokenizer_class": "PreTrainedTokenizer"
+}
diff --git a/checkpoint-288/trainer_state.json b/checkpoint-288/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..0d7af5a676597af462dcbff5f3cee84d291bae9b
--- /dev/null
+++ b/checkpoint-288/trainer_state.json
@@ -0,0 +1,2049 @@
+{
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 3.953917050691244,
+ "eval_steps": 500,
+ "global_step": 288,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.013824884792626729,
+ "grad_norm": 34.963134765625,
+ "learning_rate": 5.0000000000000004e-08,
+ "loss": 2.5476,
+ "step": 1
+ },
+ {
+ "epoch": 0.027649769585253458,
+ "grad_norm": 35.32600021362305,
+ "learning_rate": 1.0000000000000001e-07,
+ "loss": 2.6058,
+ "step": 2
+ },
+ {
+ "epoch": 0.041474654377880185,
+ "grad_norm": 34.955448150634766,
+ "learning_rate": 1.5000000000000002e-07,
+ "loss": 2.5871,
+ "step": 3
+ },
+ {
+ "epoch": 0.055299539170506916,
+ "grad_norm": 35.09806442260742,
+ "learning_rate": 2.0000000000000002e-07,
+ "loss": 2.5912,
+ "step": 4
+ },
+ {
+ "epoch": 0.06912442396313365,
+ "grad_norm": 34.88739776611328,
+ "learning_rate": 2.5000000000000004e-07,
+ "loss": 2.592,
+ "step": 5
+ },
+ {
+ "epoch": 0.08294930875576037,
+ "grad_norm": 34.84288024902344,
+ "learning_rate": 3.0000000000000004e-07,
+ "loss": 2.5609,
+ "step": 6
+ },
+ {
+ "epoch": 0.0967741935483871,
+ "grad_norm": 35.0090217590332,
+ "learning_rate": 3.5000000000000004e-07,
+ "loss": 2.5651,
+ "step": 7
+ },
+ {
+ "epoch": 0.11059907834101383,
+ "grad_norm": 35.03983688354492,
+ "learning_rate": 4.0000000000000003e-07,
+ "loss": 2.5437,
+ "step": 8
+ },
+ {
+ "epoch": 0.12442396313364056,
+ "grad_norm": 34.802833557128906,
+ "learning_rate": 4.5000000000000003e-07,
+ "loss": 2.5448,
+ "step": 9
+ },
+ {
+ "epoch": 0.1382488479262673,
+ "grad_norm": 34.5220947265625,
+ "learning_rate": 5.000000000000001e-07,
+ "loss": 2.504,
+ "step": 10
+ },
+ {
+ "epoch": 0.15207373271889402,
+ "grad_norm": 34.401580810546875,
+ "learning_rate": 5.5e-07,
+ "loss": 2.4814,
+ "step": 11
+ },
+ {
+ "epoch": 0.16589861751152074,
+ "grad_norm": 33.76997375488281,
+ "learning_rate": 6.000000000000001e-07,
+ "loss": 2.4282,
+ "step": 12
+ },
+ {
+ "epoch": 0.17972350230414746,
+ "grad_norm": 33.53415298461914,
+ "learning_rate": 6.5e-07,
+ "loss": 2.4216,
+ "step": 13
+ },
+ {
+ "epoch": 0.1935483870967742,
+ "grad_norm": 32.401580810546875,
+ "learning_rate": 7.000000000000001e-07,
+ "loss": 2.3362,
+ "step": 14
+ },
+ {
+ "epoch": 0.2073732718894009,
+ "grad_norm": 33.636661529541016,
+ "learning_rate": 7.5e-07,
+ "loss": 2.2978,
+ "step": 15
+ },
+ {
+ "epoch": 0.22119815668202766,
+ "grad_norm": 31.3782901763916,
+ "learning_rate": 8.000000000000001e-07,
+ "loss": 2.1358,
+ "step": 16
+ },
+ {
+ "epoch": 0.2350230414746544,
+ "grad_norm": 30.72391700744629,
+ "learning_rate": 8.500000000000001e-07,
+ "loss": 2.0652,
+ "step": 17
+ },
+ {
+ "epoch": 0.2488479262672811,
+ "grad_norm": 30.817584991455078,
+ "learning_rate": 9.000000000000001e-07,
+ "loss": 2.0115,
+ "step": 18
+ },
+ {
+ "epoch": 0.2626728110599078,
+ "grad_norm": 29.683996200561523,
+ "learning_rate": 9.500000000000001e-07,
+ "loss": 1.8668,
+ "step": 19
+ },
+ {
+ "epoch": 0.2764976958525346,
+ "grad_norm": 29.506683349609375,
+ "learning_rate": 1.0000000000000002e-06,
+ "loss": 1.7796,
+ "step": 20
+ },
+ {
+ "epoch": 0.2903225806451613,
+ "grad_norm": 27.55340003967285,
+ "learning_rate": 1.0500000000000001e-06,
+ "loss": 1.5656,
+ "step": 21
+ },
+ {
+ "epoch": 0.30414746543778803,
+ "grad_norm": 27.78036880493164,
+ "learning_rate": 1.1e-06,
+ "loss": 1.5112,
+ "step": 22
+ },
+ {
+ "epoch": 0.31797235023041476,
+ "grad_norm": 26.36115264892578,
+ "learning_rate": 1.1500000000000002e-06,
+ "loss": 1.3283,
+ "step": 23
+ },
+ {
+ "epoch": 0.3317972350230415,
+ "grad_norm": 25.388761520385742,
+ "learning_rate": 1.2000000000000002e-06,
+ "loss": 1.137,
+ "step": 24
+ },
+ {
+ "epoch": 0.3456221198156682,
+ "grad_norm": 25.21432876586914,
+ "learning_rate": 1.25e-06,
+ "loss": 0.9867,
+ "step": 25
+ },
+ {
+ "epoch": 0.35944700460829493,
+ "grad_norm": 24.924489974975586,
+ "learning_rate": 1.3e-06,
+ "loss": 0.7122,
+ "step": 26
+ },
+ {
+ "epoch": 0.37327188940092165,
+ "grad_norm": 21.881420135498047,
+ "learning_rate": 1.3500000000000002e-06,
+ "loss": 0.4952,
+ "step": 27
+ },
+ {
+ "epoch": 0.3870967741935484,
+ "grad_norm": 17.67154884338379,
+ "learning_rate": 1.4000000000000001e-06,
+ "loss": 0.3602,
+ "step": 28
+ },
+ {
+ "epoch": 0.4009216589861751,
+ "grad_norm": 11.489490509033203,
+ "learning_rate": 1.45e-06,
+ "loss": 0.2432,
+ "step": 29
+ },
+ {
+ "epoch": 0.4147465437788018,
+ "grad_norm": 7.622438907623291,
+ "learning_rate": 1.5e-06,
+ "loss": 0.189,
+ "step": 30
+ },
+ {
+ "epoch": 0.42857142857142855,
+ "grad_norm": 4.340638637542725,
+ "learning_rate": 1.5500000000000002e-06,
+ "loss": 0.1302,
+ "step": 31
+ },
+ {
+ "epoch": 0.4423963133640553,
+ "grad_norm": 3.079514980316162,
+ "learning_rate": 1.6000000000000001e-06,
+ "loss": 0.1075,
+ "step": 32
+ },
+ {
+ "epoch": 0.45622119815668205,
+ "grad_norm": 2.355943441390991,
+ "learning_rate": 1.6500000000000003e-06,
+ "loss": 0.0998,
+ "step": 33
+ },
+ {
+ "epoch": 0.4700460829493088,
+ "grad_norm": 1.9480725526809692,
+ "learning_rate": 1.7000000000000002e-06,
+ "loss": 0.0926,
+ "step": 34
+ },
+ {
+ "epoch": 0.4838709677419355,
+ "grad_norm": 1.8598166704177856,
+ "learning_rate": 1.75e-06,
+ "loss": 0.0733,
+ "step": 35
+ },
+ {
+ "epoch": 0.4976958525345622,
+ "grad_norm": 0.9892730712890625,
+ "learning_rate": 1.8000000000000001e-06,
+ "loss": 0.0664,
+ "step": 36
+ },
+ {
+ "epoch": 0.511520737327189,
+ "grad_norm": 0.8992418050765991,
+ "learning_rate": 1.85e-06,
+ "loss": 0.0709,
+ "step": 37
+ },
+ {
+ "epoch": 0.5253456221198156,
+ "grad_norm": 0.7340101599693298,
+ "learning_rate": 1.9000000000000002e-06,
+ "loss": 0.0535,
+ "step": 38
+ },
+ {
+ "epoch": 0.5391705069124424,
+ "grad_norm": 0.7032178044319153,
+ "learning_rate": 1.9500000000000004e-06,
+ "loss": 0.0573,
+ "step": 39
+ },
+ {
+ "epoch": 0.5529953917050692,
+ "grad_norm": 0.6449429392814636,
+ "learning_rate": 2.0000000000000003e-06,
+ "loss": 0.0576,
+ "step": 40
+ },
+ {
+ "epoch": 0.5668202764976958,
+ "grad_norm": 0.6358592510223389,
+ "learning_rate": 2.05e-06,
+ "loss": 0.0502,
+ "step": 41
+ },
+ {
+ "epoch": 0.5806451612903226,
+ "grad_norm": 0.572036623954773,
+ "learning_rate": 2.1000000000000002e-06,
+ "loss": 0.0556,
+ "step": 42
+ },
+ {
+ "epoch": 0.5944700460829493,
+ "grad_norm": 0.6538863778114319,
+ "learning_rate": 2.15e-06,
+ "loss": 0.0556,
+ "step": 43
+ },
+ {
+ "epoch": 0.6082949308755761,
+ "grad_norm": 0.3532159626483917,
+ "learning_rate": 2.2e-06,
+ "loss": 0.0452,
+ "step": 44
+ },
+ {
+ "epoch": 0.6221198156682027,
+ "grad_norm": 0.4853012263774872,
+ "learning_rate": 2.25e-06,
+ "loss": 0.0471,
+ "step": 45
+ },
+ {
+ "epoch": 0.6359447004608295,
+ "grad_norm": 0.4761648178100586,
+ "learning_rate": 2.3000000000000004e-06,
+ "loss": 0.0469,
+ "step": 46
+ },
+ {
+ "epoch": 0.6497695852534562,
+ "grad_norm": 0.6094638109207153,
+ "learning_rate": 2.35e-06,
+ "loss": 0.047,
+ "step": 47
+ },
+ {
+ "epoch": 0.663594470046083,
+ "grad_norm": 0.5211306214332581,
+ "learning_rate": 2.4000000000000003e-06,
+ "loss": 0.0402,
+ "step": 48
+ },
+ {
+ "epoch": 0.6774193548387096,
+ "grad_norm": 0.2997778356075287,
+ "learning_rate": 2.4500000000000003e-06,
+ "loss": 0.0425,
+ "step": 49
+ },
+ {
+ "epoch": 0.6912442396313364,
+ "grad_norm": 0.37834689021110535,
+ "learning_rate": 2.5e-06,
+ "loss": 0.0446,
+ "step": 50
+ },
+ {
+ "epoch": 0.7050691244239631,
+ "grad_norm": 0.31011995673179626,
+ "learning_rate": 2.55e-06,
+ "loss": 0.0406,
+ "step": 51
+ },
+ {
+ "epoch": 0.7188940092165899,
+ "grad_norm": 0.3113131523132324,
+ "learning_rate": 2.6e-06,
+ "loss": 0.0368,
+ "step": 52
+ },
+ {
+ "epoch": 0.7327188940092166,
+ "grad_norm": 0.5685846209526062,
+ "learning_rate": 2.6500000000000005e-06,
+ "loss": 0.0389,
+ "step": 53
+ },
+ {
+ "epoch": 0.7465437788018433,
+ "grad_norm": 0.29334983229637146,
+ "learning_rate": 2.7000000000000004e-06,
+ "loss": 0.0423,
+ "step": 54
+ },
+ {
+ "epoch": 0.7603686635944701,
+ "grad_norm": 0.5776861906051636,
+ "learning_rate": 2.7500000000000004e-06,
+ "loss": 0.0399,
+ "step": 55
+ },
+ {
+ "epoch": 0.7741935483870968,
+ "grad_norm": 0.35423165559768677,
+ "learning_rate": 2.8000000000000003e-06,
+ "loss": 0.0357,
+ "step": 56
+ },
+ {
+ "epoch": 0.7880184331797235,
+ "grad_norm": 0.37902742624282837,
+ "learning_rate": 2.85e-06,
+ "loss": 0.0407,
+ "step": 57
+ },
+ {
+ "epoch": 0.8018433179723502,
+ "grad_norm": 0.26948878169059753,
+ "learning_rate": 2.9e-06,
+ "loss": 0.0351,
+ "step": 58
+ },
+ {
+ "epoch": 0.815668202764977,
+ "grad_norm": 0.35688117146492004,
+ "learning_rate": 2.95e-06,
+ "loss": 0.0377,
+ "step": 59
+ },
+ {
+ "epoch": 0.8294930875576036,
+ "grad_norm": 0.5287911891937256,
+ "learning_rate": 3e-06,
+ "loss": 0.0377,
+ "step": 60
+ },
+ {
+ "epoch": 0.8433179723502304,
+ "grad_norm": 0.2950785756111145,
+ "learning_rate": 3.05e-06,
+ "loss": 0.0361,
+ "step": 61
+ },
+ {
+ "epoch": 0.8571428571428571,
+ "grad_norm": 0.2789723575115204,
+ "learning_rate": 3.1000000000000004e-06,
+ "loss": 0.032,
+ "step": 62
+ },
+ {
+ "epoch": 0.8709677419354839,
+ "grad_norm": 0.2802198529243469,
+ "learning_rate": 3.1500000000000003e-06,
+ "loss": 0.0394,
+ "step": 63
+ },
+ {
+ "epoch": 0.8847926267281107,
+ "grad_norm": 0.286981463432312,
+ "learning_rate": 3.2000000000000003e-06,
+ "loss": 0.033,
+ "step": 64
+ },
+ {
+ "epoch": 0.8986175115207373,
+ "grad_norm": 0.37392762303352356,
+ "learning_rate": 3.2500000000000002e-06,
+ "loss": 0.0335,
+ "step": 65
+ },
+ {
+ "epoch": 0.9124423963133641,
+ "grad_norm": 0.25025588274002075,
+ "learning_rate": 3.3000000000000006e-06,
+ "loss": 0.0311,
+ "step": 66
+ },
+ {
+ "epoch": 0.9262672811059908,
+ "grad_norm": 0.4292861521244049,
+ "learning_rate": 3.3500000000000005e-06,
+ "loss": 0.0362,
+ "step": 67
+ },
+ {
+ "epoch": 0.9400921658986175,
+ "grad_norm": 0.4717651307582855,
+ "learning_rate": 3.4000000000000005e-06,
+ "loss": 0.0303,
+ "step": 68
+ },
+ {
+ "epoch": 0.9539170506912442,
+ "grad_norm": 0.49291253089904785,
+ "learning_rate": 3.45e-06,
+ "loss": 0.0352,
+ "step": 69
+ },
+ {
+ "epoch": 0.967741935483871,
+ "grad_norm": 0.3729935586452484,
+ "learning_rate": 3.5e-06,
+ "loss": 0.0297,
+ "step": 70
+ },
+ {
+ "epoch": 0.9815668202764977,
+ "grad_norm": 0.27150583267211914,
+ "learning_rate": 3.5500000000000003e-06,
+ "loss": 0.0326,
+ "step": 71
+ },
+ {
+ "epoch": 0.9953917050691244,
+ "grad_norm": 0.34516096115112305,
+ "learning_rate": 3.6000000000000003e-06,
+ "loss": 0.0336,
+ "step": 72
+ },
+ {
+ "epoch": 1.0,
+ "grad_norm": 0.34516096115112305,
+ "learning_rate": 3.65e-06,
+ "loss": 0.0274,
+ "step": 73
+ },
+ {
+ "epoch": 1.0138248847926268,
+ "grad_norm": 0.6282734870910645,
+ "learning_rate": 3.7e-06,
+ "loss": 0.0289,
+ "step": 74
+ },
+ {
+ "epoch": 1.0276497695852536,
+ "grad_norm": 0.2935558557510376,
+ "learning_rate": 3.7500000000000005e-06,
+ "loss": 0.0308,
+ "step": 75
+ },
+ {
+ "epoch": 1.0414746543778801,
+ "grad_norm": 0.3166769742965698,
+ "learning_rate": 3.8000000000000005e-06,
+ "loss": 0.0277,
+ "step": 76
+ },
+ {
+ "epoch": 1.055299539170507,
+ "grad_norm": 0.38190239667892456,
+ "learning_rate": 3.85e-06,
+ "loss": 0.0338,
+ "step": 77
+ },
+ {
+ "epoch": 1.0691244239631337,
+ "grad_norm": 0.2779421806335449,
+ "learning_rate": 3.900000000000001e-06,
+ "loss": 0.03,
+ "step": 78
+ },
+ {
+ "epoch": 1.0829493087557605,
+ "grad_norm": 0.4055996537208557,
+ "learning_rate": 3.95e-06,
+ "loss": 0.0295,
+ "step": 79
+ },
+ {
+ "epoch": 1.096774193548387,
+ "grad_norm": 0.2987312972545624,
+ "learning_rate": 4.000000000000001e-06,
+ "loss": 0.028,
+ "step": 80
+ },
+ {
+ "epoch": 1.1105990783410138,
+ "grad_norm": 0.2674776017665863,
+ "learning_rate": 4.05e-06,
+ "loss": 0.0243,
+ "step": 81
+ },
+ {
+ "epoch": 1.1244239631336406,
+ "grad_norm": 0.29042816162109375,
+ "learning_rate": 4.1e-06,
+ "loss": 0.0318,
+ "step": 82
+ },
+ {
+ "epoch": 1.1382488479262673,
+ "grad_norm": 0.2904883027076721,
+ "learning_rate": 4.15e-06,
+ "loss": 0.0257,
+ "step": 83
+ },
+ {
+ "epoch": 1.1520737327188941,
+ "grad_norm": 0.30603015422821045,
+ "learning_rate": 4.2000000000000004e-06,
+ "loss": 0.0284,
+ "step": 84
+ },
+ {
+ "epoch": 1.1658986175115207,
+ "grad_norm": 0.23131045699119568,
+ "learning_rate": 4.25e-06,
+ "loss": 0.0285,
+ "step": 85
+ },
+ {
+ "epoch": 1.1797235023041475,
+ "grad_norm": 0.26788002252578735,
+ "learning_rate": 4.3e-06,
+ "loss": 0.0269,
+ "step": 86
+ },
+ {
+ "epoch": 1.1935483870967742,
+ "grad_norm": 0.2639651894569397,
+ "learning_rate": 4.350000000000001e-06,
+ "loss": 0.0289,
+ "step": 87
+ },
+ {
+ "epoch": 1.2073732718894008,
+ "grad_norm": 0.25068584084510803,
+ "learning_rate": 4.4e-06,
+ "loss": 0.0275,
+ "step": 88
+ },
+ {
+ "epoch": 1.2211981566820276,
+ "grad_norm": 0.25494542717933655,
+ "learning_rate": 4.450000000000001e-06,
+ "loss": 0.0275,
+ "step": 89
+ },
+ {
+ "epoch": 1.2350230414746544,
+ "grad_norm": 0.31125035881996155,
+ "learning_rate": 4.5e-06,
+ "loss": 0.0251,
+ "step": 90
+ },
+ {
+ "epoch": 1.2488479262672811,
+ "grad_norm": 0.2691773474216461,
+ "learning_rate": 4.5500000000000005e-06,
+ "loss": 0.0267,
+ "step": 91
+ },
+ {
+ "epoch": 1.262672811059908,
+ "grad_norm": 0.20079147815704346,
+ "learning_rate": 4.600000000000001e-06,
+ "loss": 0.0263,
+ "step": 92
+ },
+ {
+ "epoch": 1.2764976958525347,
+ "grad_norm": 0.28027331829071045,
+ "learning_rate": 4.65e-06,
+ "loss": 0.0227,
+ "step": 93
+ },
+ {
+ "epoch": 1.2903225806451613,
+ "grad_norm": 0.40053099393844604,
+ "learning_rate": 4.7e-06,
+ "loss": 0.0246,
+ "step": 94
+ },
+ {
+ "epoch": 1.304147465437788,
+ "grad_norm": 0.33066362142562866,
+ "learning_rate": 4.75e-06,
+ "loss": 0.0221,
+ "step": 95
+ },
+ {
+ "epoch": 1.3179723502304148,
+ "grad_norm": 0.2531339228153229,
+ "learning_rate": 4.800000000000001e-06,
+ "loss": 0.0216,
+ "step": 96
+ },
+ {
+ "epoch": 1.3317972350230414,
+ "grad_norm": 0.37544378638267517,
+ "learning_rate": 4.85e-06,
+ "loss": 0.0247,
+ "step": 97
+ },
+ {
+ "epoch": 1.3456221198156681,
+ "grad_norm": 0.34273672103881836,
+ "learning_rate": 4.9000000000000005e-06,
+ "loss": 0.0217,
+ "step": 98
+ },
+ {
+ "epoch": 1.359447004608295,
+ "grad_norm": 0.2338661253452301,
+ "learning_rate": 4.95e-06,
+ "loss": 0.0237,
+ "step": 99
+ },
+ {
+ "epoch": 1.3732718894009217,
+ "grad_norm": 0.30151981115341187,
+ "learning_rate": 5e-06,
+ "loss": 0.0248,
+ "step": 100
+ },
+ {
+ "epoch": 1.3870967741935485,
+ "grad_norm": 0.3205336630344391,
+ "learning_rate": 4.999888074163108e-06,
+ "loss": 0.0232,
+ "step": 101
+ },
+ {
+ "epoch": 1.400921658986175,
+ "grad_norm": 0.2705315351486206,
+ "learning_rate": 4.999552306674345e-06,
+ "loss": 0.0245,
+ "step": 102
+ },
+ {
+ "epoch": 1.4147465437788018,
+ "grad_norm": 0.2564137578010559,
+ "learning_rate": 4.998992727598557e-06,
+ "loss": 0.0274,
+ "step": 103
+ },
+ {
+ "epoch": 1.4285714285714286,
+ "grad_norm": 0.1967611312866211,
+ "learning_rate": 4.998209387040829e-06,
+ "loss": 0.0173,
+ "step": 104
+ },
+ {
+ "epoch": 1.4423963133640554,
+ "grad_norm": 0.2568240761756897,
+ "learning_rate": 4.9972023551419995e-06,
+ "loss": 0.0223,
+ "step": 105
+ },
+ {
+ "epoch": 1.456221198156682,
+ "grad_norm": 0.2236352413892746,
+ "learning_rate": 4.995971722072379e-06,
+ "loss": 0.0202,
+ "step": 106
+ },
+ {
+ "epoch": 1.4700460829493087,
+ "grad_norm": 0.3389627933502197,
+ "learning_rate": 4.9945175980236745e-06,
+ "loss": 0.0214,
+ "step": 107
+ },
+ {
+ "epoch": 1.4838709677419355,
+ "grad_norm": 0.31428012251853943,
+ "learning_rate": 4.992840113199131e-06,
+ "loss": 0.0188,
+ "step": 108
+ },
+ {
+ "epoch": 1.4976958525345623,
+ "grad_norm": 0.41508516669273376,
+ "learning_rate": 4.990939417801859e-06,
+ "loss": 0.0213,
+ "step": 109
+ },
+ {
+ "epoch": 1.511520737327189,
+ "grad_norm": 0.19615545868873596,
+ "learning_rate": 4.988815682021398e-06,
+ "loss": 0.0191,
+ "step": 110
+ },
+ {
+ "epoch": 1.5253456221198156,
+ "grad_norm": 0.2059931755065918,
+ "learning_rate": 4.986469096018472e-06,
+ "loss": 0.0208,
+ "step": 111
+ },
+ {
+ "epoch": 1.5391705069124424,
+ "grad_norm": 0.26946336030960083,
+ "learning_rate": 4.983899869907963e-06,
+ "loss": 0.0192,
+ "step": 112
+ },
+ {
+ "epoch": 1.5529953917050692,
+ "grad_norm": 0.3227538466453552,
+ "learning_rate": 4.981108233740096e-06,
+ "loss": 0.0169,
+ "step": 113
+ },
+ {
+ "epoch": 1.5668202764976957,
+ "grad_norm": 0.2811918258666992,
+ "learning_rate": 4.978094437479843e-06,
+ "loss": 0.0151,
+ "step": 114
+ },
+ {
+ "epoch": 1.5806451612903225,
+ "grad_norm": 0.32980477809906006,
+ "learning_rate": 4.97485875098454e-06,
+ "loss": 0.0182,
+ "step": 115
+ },
+ {
+ "epoch": 1.5944700460829493,
+ "grad_norm": 0.2759259045124054,
+ "learning_rate": 4.971401463979722e-06,
+ "loss": 0.0192,
+ "step": 116
+ },
+ {
+ "epoch": 1.608294930875576,
+ "grad_norm": 0.2572178840637207,
+ "learning_rate": 4.967722886033181e-06,
+ "loss": 0.0198,
+ "step": 117
+ },
+ {
+ "epoch": 1.6221198156682028,
+ "grad_norm": 0.3238658905029297,
+ "learning_rate": 4.963823346527249e-06,
+ "loss": 0.0186,
+ "step": 118
+ },
+ {
+ "epoch": 1.6359447004608296,
+ "grad_norm": 0.3834918737411499,
+ "learning_rate": 4.959703194629304e-06,
+ "loss": 0.0188,
+ "step": 119
+ },
+ {
+ "epoch": 1.6497695852534562,
+ "grad_norm": 0.23881244659423828,
+ "learning_rate": 4.955362799260507e-06,
+ "loss": 0.0182,
+ "step": 120
+ },
+ {
+ "epoch": 1.663594470046083,
+ "grad_norm": 0.1885918825864792,
+ "learning_rate": 4.950802549062764e-06,
+ "loss": 0.0183,
+ "step": 121
+ },
+ {
+ "epoch": 1.6774193548387095,
+ "grad_norm": 0.34959614276885986,
+ "learning_rate": 4.946022852363932e-06,
+ "loss": 0.0173,
+ "step": 122
+ },
+ {
+ "epoch": 1.6912442396313363,
+ "grad_norm": 0.22990310192108154,
+ "learning_rate": 4.9410241371412525e-06,
+ "loss": 0.0135,
+ "step": 123
+ },
+ {
+ "epoch": 1.705069124423963,
+ "grad_norm": 0.2790350615978241,
+ "learning_rate": 4.935806850983034e-06,
+ "loss": 0.0159,
+ "step": 124
+ },
+ {
+ "epoch": 1.7188940092165899,
+ "grad_norm": 0.3218020796775818,
+ "learning_rate": 4.9303714610485705e-06,
+ "loss": 0.0176,
+ "step": 125
+ },
+ {
+ "epoch": 1.7327188940092166,
+ "grad_norm": 0.2294609695672989,
+ "learning_rate": 4.924718454026318e-06,
+ "loss": 0.0149,
+ "step": 126
+ },
+ {
+ "epoch": 1.7465437788018434,
+ "grad_norm": 0.3427927494049072,
+ "learning_rate": 4.918848336090309e-06,
+ "loss": 0.0165,
+ "step": 127
+ },
+ {
+ "epoch": 1.7603686635944702,
+ "grad_norm": 0.22731825709342957,
+ "learning_rate": 4.912761632854834e-06,
+ "loss": 0.0145,
+ "step": 128
+ },
+ {
+ "epoch": 1.7741935483870968,
+ "grad_norm": 0.35364386439323425,
+ "learning_rate": 4.906458889327375e-06,
+ "loss": 0.0161,
+ "step": 129
+ },
+ {
+ "epoch": 1.7880184331797235,
+ "grad_norm": 0.29476454854011536,
+ "learning_rate": 4.899940669859807e-06,
+ "loss": 0.0154,
+ "step": 130
+ },
+ {
+ "epoch": 1.80184331797235,
+ "grad_norm": 0.28667864203453064,
+ "learning_rate": 4.893207558097867e-06,
+ "loss": 0.0143,
+ "step": 131
+ },
+ {
+ "epoch": 1.8156682027649769,
+ "grad_norm": 0.2731999158859253,
+ "learning_rate": 4.8862601569288885e-06,
+ "loss": 0.0141,
+ "step": 132
+ },
+ {
+ "epoch": 1.8294930875576036,
+ "grad_norm": 0.2670470178127289,
+ "learning_rate": 4.879099088427824e-06,
+ "loss": 0.0131,
+ "step": 133
+ },
+ {
+ "epoch": 1.8433179723502304,
+ "grad_norm": 0.23313525319099426,
+ "learning_rate": 4.871724993801541e-06,
+ "loss": 0.012,
+ "step": 134
+ },
+ {
+ "epoch": 1.8571428571428572,
+ "grad_norm": 0.2192607820034027,
+ "learning_rate": 4.864138533331411e-06,
+ "loss": 0.0125,
+ "step": 135
+ },
+ {
+ "epoch": 1.870967741935484,
+ "grad_norm": 0.26603585481643677,
+ "learning_rate": 4.8563403863141825e-06,
+ "loss": 0.0121,
+ "step": 136
+ },
+ {
+ "epoch": 1.8847926267281108,
+ "grad_norm": 0.32500001788139343,
+ "learning_rate": 4.84833125100116e-06,
+ "loss": 0.0116,
+ "step": 137
+ },
+ {
+ "epoch": 1.8986175115207373,
+ "grad_norm": 0.24893291294574738,
+ "learning_rate": 4.840111844535682e-06,
+ "loss": 0.0119,
+ "step": 138
+ },
+ {
+ "epoch": 1.912442396313364,
+ "grad_norm": 0.17670764029026031,
+ "learning_rate": 4.8316829028889076e-06,
+ "loss": 0.0096,
+ "step": 139
+ },
+ {
+ "epoch": 1.9262672811059907,
+ "grad_norm": 0.16747575998306274,
+ "learning_rate": 4.823045180793914e-06,
+ "loss": 0.0113,
+ "step": 140
+ },
+ {
+ "epoch": 1.9400921658986174,
+ "grad_norm": 0.19587458670139313,
+ "learning_rate": 4.8141994516781196e-06,
+ "loss": 0.0111,
+ "step": 141
+ },
+ {
+ "epoch": 1.9539170506912442,
+ "grad_norm": 0.237543985247612,
+ "learning_rate": 4.805146507594034e-06,
+ "loss": 0.0088,
+ "step": 142
+ },
+ {
+ "epoch": 1.967741935483871,
+ "grad_norm": 0.22710399329662323,
+ "learning_rate": 4.7958871591483305e-06,
+ "loss": 0.0085,
+ "step": 143
+ },
+ {
+ "epoch": 1.9815668202764978,
+ "grad_norm": 0.2946629822254181,
+ "learning_rate": 4.786422235429269e-06,
+ "loss": 0.0122,
+ "step": 144
+ },
+ {
+ "epoch": 1.9953917050691246,
+ "grad_norm": 0.2763853371143341,
+ "learning_rate": 4.776752583932455e-06,
+ "loss": 0.0118,
+ "step": 145
+ },
+ {
+ "epoch": 2.0,
+ "grad_norm": 0.2763853371143341,
+ "learning_rate": 4.766879070484957e-06,
+ "loss": 0.0078,
+ "step": 146
+ },
+ {
+ "epoch": 2.013824884792627,
+ "grad_norm": 0.2722196877002716,
+ "learning_rate": 4.756802579167781e-06,
+ "loss": 0.0076,
+ "step": 147
+ },
+ {
+ "epoch": 2.0276497695852536,
+ "grad_norm": 0.18556565046310425,
+ "learning_rate": 4.746524012236706e-06,
+ "loss": 0.0091,
+ "step": 148
+ },
+ {
+ "epoch": 2.0414746543778803,
+ "grad_norm": 0.24442361295223236,
+ "learning_rate": 4.736044290041496e-06,
+ "loss": 0.009,
+ "step": 149
+ },
+ {
+ "epoch": 2.055299539170507,
+ "grad_norm": 0.24207571148872375,
+ "learning_rate": 4.725364350943492e-06,
+ "loss": 0.0085,
+ "step": 150
+ },
+ {
+ "epoch": 2.0691244239631335,
+ "grad_norm": 0.18502290546894073,
+ "learning_rate": 4.714485151231593e-06,
+ "loss": 0.0059,
+ "step": 151
+ },
+ {
+ "epoch": 2.0829493087557602,
+ "grad_norm": 0.3010450303554535,
+ "learning_rate": 4.703407665036622e-06,
+ "loss": 0.0071,
+ "step": 152
+ },
+ {
+ "epoch": 2.096774193548387,
+ "grad_norm": 0.23272967338562012,
+ "learning_rate": 4.692132884244113e-06,
+ "loss": 0.0074,
+ "step": 153
+ },
+ {
+ "epoch": 2.110599078341014,
+ "grad_norm": 0.25476181507110596,
+ "learning_rate": 4.680661818405485e-06,
+ "loss": 0.0082,
+ "step": 154
+ },
+ {
+ "epoch": 2.1244239631336406,
+ "grad_norm": 0.24534538388252258,
+ "learning_rate": 4.668995494647653e-06,
+ "loss": 0.0065,
+ "step": 155
+ },
+ {
+ "epoch": 2.1382488479262673,
+ "grad_norm": 0.1642732173204422,
+ "learning_rate": 4.657134957581057e-06,
+ "loss": 0.0054,
+ "step": 156
+ },
+ {
+ "epoch": 2.152073732718894,
+ "grad_norm": 0.21100501716136932,
+ "learning_rate": 4.645081269206128e-06,
+ "loss": 0.0091,
+ "step": 157
+ },
+ {
+ "epoch": 2.165898617511521,
+ "grad_norm": 0.19043587148189545,
+ "learning_rate": 4.632835508818192e-06,
+ "loss": 0.0047,
+ "step": 158
+ },
+ {
+ "epoch": 2.1797235023041477,
+ "grad_norm": 0.1804375797510147,
+ "learning_rate": 4.620398772910833e-06,
+ "loss": 0.0068,
+ "step": 159
+ },
+ {
+ "epoch": 2.193548387096774,
+ "grad_norm": 0.6586657762527466,
+ "learning_rate": 4.607772175077712e-06,
+ "loss": 0.0049,
+ "step": 160
+ },
+ {
+ "epoch": 2.207373271889401,
+ "grad_norm": 0.18181656301021576,
+ "learning_rate": 4.59495684591285e-06,
+ "loss": 0.0071,
+ "step": 161
+ },
+ {
+ "epoch": 2.2211981566820276,
+ "grad_norm": 0.760053813457489,
+ "learning_rate": 4.581953932909403e-06,
+ "loss": 0.0065,
+ "step": 162
+ },
+ {
+ "epoch": 2.2350230414746544,
+ "grad_norm": 0.1935238242149353,
+ "learning_rate": 4.5687646003569055e-06,
+ "loss": 0.0066,
+ "step": 163
+ },
+ {
+ "epoch": 2.248847926267281,
+ "grad_norm": 0.3035024404525757,
+ "learning_rate": 4.555390029237026e-06,
+ "loss": 0.0046,
+ "step": 164
+ },
+ {
+ "epoch": 2.262672811059908,
+ "grad_norm": 0.16596420109272003,
+ "learning_rate": 4.541831417117815e-06,
+ "loss": 0.007,
+ "step": 165
+ },
+ {
+ "epoch": 2.2764976958525347,
+ "grad_norm": 0.2578873336315155,
+ "learning_rate": 4.528089978046481e-06,
+ "loss": 0.0048,
+ "step": 166
+ },
+ {
+ "epoch": 2.2903225806451615,
+ "grad_norm": 1.7751781940460205,
+ "learning_rate": 4.514166942440679e-06,
+ "loss": 0.0041,
+ "step": 167
+ },
+ {
+ "epoch": 2.3041474654377883,
+ "grad_norm": 0.37872445583343506,
+ "learning_rate": 4.5000635569783365e-06,
+ "loss": 0.0045,
+ "step": 168
+ },
+ {
+ "epoch": 2.3179723502304146,
+ "grad_norm": 0.22949594259262085,
+ "learning_rate": 4.4857810844860325e-06,
+ "loss": 0.0071,
+ "step": 169
+ },
+ {
+ "epoch": 2.3317972350230414,
+ "grad_norm": 0.34662699699401855,
+ "learning_rate": 4.471320803825915e-06,
+ "loss": 0.006,
+ "step": 170
+ },
+ {
+ "epoch": 2.345622119815668,
+ "grad_norm": 0.5892661213874817,
+ "learning_rate": 4.4566840097811956e-06,
+ "loss": 0.0055,
+ "step": 171
+ },
+ {
+ "epoch": 2.359447004608295,
+ "grad_norm": 0.18866907060146332,
+ "learning_rate": 4.4418720129402145e-06,
+ "loss": 0.0036,
+ "step": 172
+ },
+ {
+ "epoch": 2.3732718894009217,
+ "grad_norm": 0.1510942429304123,
+ "learning_rate": 4.426886139579083e-06,
+ "loss": 0.0065,
+ "step": 173
+ },
+ {
+ "epoch": 2.3870967741935485,
+ "grad_norm": 0.21291828155517578,
+ "learning_rate": 4.411727731542937e-06,
+ "loss": 0.004,
+ "step": 174
+ },
+ {
+ "epoch": 2.4009216589861753,
+ "grad_norm": 0.18649035692214966,
+ "learning_rate": 4.39639814612578e-06,
+ "loss": 0.0047,
+ "step": 175
+ },
+ {
+ "epoch": 2.4147465437788016,
+ "grad_norm": 0.19008278846740723,
+ "learning_rate": 4.3808987559489536e-06,
+ "loss": 0.0071,
+ "step": 176
+ },
+ {
+ "epoch": 2.4285714285714284,
+ "grad_norm": 0.26282456517219543,
+ "learning_rate": 4.365230948838232e-06,
+ "loss": 0.0044,
+ "step": 177
+ },
+ {
+ "epoch": 2.442396313364055,
+ "grad_norm": 0.2351403385400772,
+ "learning_rate": 4.349396127699552e-06,
+ "loss": 0.0057,
+ "step": 178
+ },
+ {
+ "epoch": 2.456221198156682,
+ "grad_norm": 0.20451441407203674,
+ "learning_rate": 4.3333957103934025e-06,
+ "loss": 0.003,
+ "step": 179
+ },
+ {
+ "epoch": 2.4700460829493087,
+ "grad_norm": 0.22120380401611328,
+ "learning_rate": 4.317231129607859e-06,
+ "loss": 0.0045,
+ "step": 180
+ },
+ {
+ "epoch": 2.4838709677419355,
+ "grad_norm": 0.18543967604637146,
+ "learning_rate": 4.30090383273031e-06,
+ "loss": 0.0062,
+ "step": 181
+ },
+ {
+ "epoch": 2.4976958525345623,
+ "grad_norm": 0.18473730981349945,
+ "learning_rate": 4.2844152817178476e-06,
+ "loss": 0.0052,
+ "step": 182
+ },
+ {
+ "epoch": 2.511520737327189,
+ "grad_norm": 0.21087361872196198,
+ "learning_rate": 4.267766952966369e-06,
+ "loss": 0.0041,
+ "step": 183
+ },
+ {
+ "epoch": 2.525345622119816,
+ "grad_norm": 0.24977360665798187,
+ "learning_rate": 4.2509603371783776e-06,
+ "loss": 0.0053,
+ "step": 184
+ },
+ {
+ "epoch": 2.539170506912442,
+ "grad_norm": 0.19377018511295319,
+ "learning_rate": 4.233996939229502e-06,
+ "loss": 0.0037,
+ "step": 185
+ },
+ {
+ "epoch": 2.5529953917050694,
+ "grad_norm": 0.21130548417568207,
+ "learning_rate": 4.216878278033753e-06,
+ "loss": 0.0031,
+ "step": 186
+ },
+ {
+ "epoch": 2.5668202764976957,
+ "grad_norm": 0.13288047909736633,
+ "learning_rate": 4.199605886407515e-06,
+ "loss": 0.0029,
+ "step": 187
+ },
+ {
+ "epoch": 2.5806451612903225,
+ "grad_norm": 0.15998876094818115,
+ "learning_rate": 4.1821813109322975e-06,
+ "loss": 0.0031,
+ "step": 188
+ },
+ {
+ "epoch": 2.5944700460829493,
+ "grad_norm": 0.19475506246089935,
+ "learning_rate": 4.164606111816256e-06,
+ "loss": 0.0022,
+ "step": 189
+ },
+ {
+ "epoch": 2.608294930875576,
+ "grad_norm": 0.1446300446987152,
+ "learning_rate": 4.146881862754485e-06,
+ "loss": 0.0025,
+ "step": 190
+ },
+ {
+ "epoch": 2.622119815668203,
+ "grad_norm": 0.13051164150238037,
+ "learning_rate": 4.129010150788112e-06,
+ "loss": 0.0019,
+ "step": 191
+ },
+ {
+ "epoch": 2.6359447004608296,
+ "grad_norm": 0.1953984946012497,
+ "learning_rate": 4.110992576162193e-06,
+ "loss": 0.0021,
+ "step": 192
+ },
+ {
+ "epoch": 2.6497695852534564,
+ "grad_norm": 0.23630598187446594,
+ "learning_rate": 4.092830752182423e-06,
+ "loss": 0.002,
+ "step": 193
+ },
+ {
+ "epoch": 2.6635944700460827,
+ "grad_norm": 0.2919062376022339,
+ "learning_rate": 4.074526305070679e-06,
+ "loss": 0.0017,
+ "step": 194
+ },
+ {
+ "epoch": 2.6774193548387095,
+ "grad_norm": 0.22015534341335297,
+ "learning_rate": 4.056080873819412e-06,
+ "loss": 0.0025,
+ "step": 195
+ },
+ {
+ "epoch": 2.6912442396313363,
+ "grad_norm": 0.9449160099029541,
+ "learning_rate": 4.037496110044885e-06,
+ "loss": 0.0024,
+ "step": 196
+ },
+ {
+ "epoch": 2.705069124423963,
+ "grad_norm": 0.25235581398010254,
+ "learning_rate": 4.018773677839289e-06,
+ "loss": 0.0031,
+ "step": 197
+ },
+ {
+ "epoch": 2.71889400921659,
+ "grad_norm": 0.3098089098930359,
+ "learning_rate": 3.999915253621739e-06,
+ "loss": 0.0019,
+ "step": 198
+ },
+ {
+ "epoch": 2.7327188940092166,
+ "grad_norm": 0.19896291196346283,
+ "learning_rate": 3.980922525988167e-06,
+ "loss": 0.0019,
+ "step": 199
+ },
+ {
+ "epoch": 2.7465437788018434,
+ "grad_norm": 0.21136268973350525,
+ "learning_rate": 3.961797195560118e-06,
+ "loss": 0.0031,
+ "step": 200
+ },
+ {
+ "epoch": 2.76036866359447,
+ "grad_norm": 0.2549005150794983,
+ "learning_rate": 3.942540974832486e-06,
+ "loss": 0.0017,
+ "step": 201
+ },
+ {
+ "epoch": 2.774193548387097,
+ "grad_norm": 0.14762410521507263,
+ "learning_rate": 3.9231555880201655e-06,
+ "loss": 0.0022,
+ "step": 202
+ },
+ {
+ "epoch": 2.7880184331797233,
+ "grad_norm": 0.16235944628715515,
+ "learning_rate": 3.903642770903671e-06,
+ "loss": 0.0012,
+ "step": 203
+ },
+ {
+ "epoch": 2.80184331797235,
+ "grad_norm": 0.1506718099117279,
+ "learning_rate": 3.884004270673711e-06,
+ "loss": 0.0015,
+ "step": 204
+ },
+ {
+ "epoch": 2.815668202764977,
+ "grad_norm": 0.10484135895967484,
+ "learning_rate": 3.864241845774746e-06,
+ "loss": 0.0016,
+ "step": 205
+ },
+ {
+ "epoch": 2.8294930875576036,
+ "grad_norm": 0.7636306285858154,
+ "learning_rate": 3.844357265747531e-06,
+ "loss": 0.0018,
+ "step": 206
+ },
+ {
+ "epoch": 2.8433179723502304,
+ "grad_norm": 0.2242082804441452,
+ "learning_rate": 3.8243523110706736e-06,
+ "loss": 0.0021,
+ "step": 207
+ },
+ {
+ "epoch": 2.857142857142857,
+ "grad_norm": 0.3264133334159851,
+ "learning_rate": 3.8042287730012117e-06,
+ "loss": 0.0021,
+ "step": 208
+ },
+ {
+ "epoch": 2.870967741935484,
+ "grad_norm": 0.12472204118967056,
+ "learning_rate": 3.7839884534142157e-06,
+ "loss": 0.0011,
+ "step": 209
+ },
+ {
+ "epoch": 2.8847926267281108,
+ "grad_norm": 0.07526414096355438,
+ "learning_rate": 3.7636331646414524e-06,
+ "loss": 0.0017,
+ "step": 210
+ },
+ {
+ "epoch": 2.8986175115207375,
+ "grad_norm": 0.16134843230247498,
+ "learning_rate": 3.7431647293091076e-06,
+ "loss": 0.0019,
+ "step": 211
+ },
+ {
+ "epoch": 2.912442396313364,
+ "grad_norm": 0.14789307117462158,
+ "learning_rate": 3.7225849801745835e-06,
+ "loss": 0.0012,
+ "step": 212
+ },
+ {
+ "epoch": 2.9262672811059907,
+ "grad_norm": 0.13681238889694214,
+ "learning_rate": 3.701895759962397e-06,
+ "loss": 0.0011,
+ "step": 213
+ },
+ {
+ "epoch": 2.9400921658986174,
+ "grad_norm": 0.10747735947370529,
+ "learning_rate": 3.6810989211991777e-06,
+ "loss": 0.0007,
+ "step": 214
+ },
+ {
+ "epoch": 2.953917050691244,
+ "grad_norm": 0.08121375739574432,
+ "learning_rate": 3.6601963260477923e-06,
+ "loss": 0.0005,
+ "step": 215
+ },
+ {
+ "epoch": 2.967741935483871,
+ "grad_norm": 0.0884300246834755,
+ "learning_rate": 3.6391898461406045e-06,
+ "loss": 0.0014,
+ "step": 216
+ },
+ {
+ "epoch": 2.9815668202764978,
+ "grad_norm": 0.18539245426654816,
+ "learning_rate": 3.6180813624118898e-06,
+ "loss": 0.002,
+ "step": 217
+ },
+ {
+ "epoch": 2.9953917050691246,
+ "grad_norm": 0.1257522702217102,
+ "learning_rate": 3.5968727649294134e-06,
+ "loss": 0.0015,
+ "step": 218
+ },
+ {
+ "epoch": 3.0,
+ "grad_norm": 1.2422555685043335,
+ "learning_rate": 3.575565952725193e-06,
+ "loss": 0.0002,
+ "step": 219
+ },
+ {
+ "epoch": 3.013824884792627,
+ "grad_norm": 0.06009506434202194,
+ "learning_rate": 3.55416283362546e-06,
+ "loss": 0.0003,
+ "step": 220
+ },
+ {
+ "epoch": 3.0276497695852536,
+ "grad_norm": 0.0876953974366188,
+ "learning_rate": 3.5326653240798283e-06,
+ "loss": 0.0005,
+ "step": 221
+ },
+ {
+ "epoch": 3.0414746543778803,
+ "grad_norm": 0.7512914538383484,
+ "learning_rate": 3.5110753489896924e-06,
+ "loss": 0.0007,
+ "step": 222
+ },
+ {
+ "epoch": 3.055299539170507,
+ "grad_norm": 0.08451899141073227,
+ "learning_rate": 3.4893948415358803e-06,
+ "loss": 0.0009,
+ "step": 223
+ },
+ {
+ "epoch": 3.0691244239631335,
+ "grad_norm": 0.15445305407047272,
+ "learning_rate": 3.4676257430055438e-06,
+ "loss": 0.0006,
+ "step": 224
+ },
+ {
+ "epoch": 3.0829493087557602,
+ "grad_norm": 0.07909094542264938,
+ "learning_rate": 3.4457700026183378e-06,
+ "loss": 0.0004,
+ "step": 225
+ },
+ {
+ "epoch": 3.096774193548387,
+ "grad_norm": 0.03637247905135155,
+ "learning_rate": 3.4238295773518924e-06,
+ "loss": 0.0003,
+ "step": 226
+ },
+ {
+ "epoch": 3.110599078341014,
+ "grad_norm": 0.203308567404747,
+ "learning_rate": 3.4018064317665745e-06,
+ "loss": 0.0003,
+ "step": 227
+ },
+ {
+ "epoch": 3.1244239631336406,
+ "grad_norm": 0.03239201754331589,
+ "learning_rate": 3.3797025378295826e-06,
+ "loss": 0.0002,
+ "step": 228
+ },
+ {
+ "epoch": 3.1382488479262673,
+ "grad_norm": 0.07106538861989975,
+ "learning_rate": 3.357519874738382e-06,
+ "loss": 0.0004,
+ "step": 229
+ },
+ {
+ "epoch": 3.152073732718894,
+ "grad_norm": 0.048268985003232956,
+ "learning_rate": 3.3352604287434752e-06,
+ "loss": 0.0003,
+ "step": 230
+ },
+ {
+ "epoch": 3.165898617511521,
+ "grad_norm": 0.0841558575630188,
+ "learning_rate": 3.31292619297056e-06,
+ "loss": 0.0003,
+ "step": 231
+ },
+ {
+ "epoch": 3.1797235023041477,
+ "grad_norm": 0.07029678672552109,
+ "learning_rate": 3.29051916724206e-06,
+ "loss": 0.0003,
+ "step": 232
+ },
+ {
+ "epoch": 3.193548387096774,
+ "grad_norm": 0.11369964480400085,
+ "learning_rate": 3.2680413578980623e-06,
+ "loss": 0.0014,
+ "step": 233
+ },
+ {
+ "epoch": 3.207373271889401,
+ "grad_norm": 0.0367964468896389,
+ "learning_rate": 3.245494777616664e-06,
+ "loss": 0.0001,
+ "step": 234
+ },
+ {
+ "epoch": 3.2211981566820276,
+ "grad_norm": 0.13746097683906555,
+ "learning_rate": 3.2228814452337587e-06,
+ "loss": 0.0003,
+ "step": 235
+ },
+ {
+ "epoch": 3.2350230414746544,
+ "grad_norm": 0.09046189486980438,
+ "learning_rate": 3.2002033855622683e-06,
+ "loss": 0.0004,
+ "step": 236
+ },
+ {
+ "epoch": 3.248847926267281,
+ "grad_norm": 0.04587667062878609,
+ "learning_rate": 3.177462629210838e-06,
+ "loss": 0.0002,
+ "step": 237
+ },
+ {
+ "epoch": 3.262672811059908,
+ "grad_norm": 0.11323168128728867,
+ "learning_rate": 3.154661212402017e-06,
+ "loss": 0.0003,
+ "step": 238
+ },
+ {
+ "epoch": 3.2764976958525347,
+ "grad_norm": 0.04728177189826965,
+ "learning_rate": 3.131801176789934e-06,
+ "loss": 0.0002,
+ "step": 239
+ },
+ {
+ "epoch": 3.2903225806451615,
+ "grad_norm": 0.527999997138977,
+ "learning_rate": 3.1088845692774798e-06,
+ "loss": 0.0008,
+ "step": 240
+ },
+ {
+ "epoch": 3.3041474654377883,
+ "grad_norm": 0.026646027341485023,
+ "learning_rate": 3.0859134418330373e-06,
+ "loss": 0.0001,
+ "step": 241
+ },
+ {
+ "epoch": 3.3179723502304146,
+ "grad_norm": 0.057450197637081146,
+ "learning_rate": 3.0628898513067357e-06,
+ "loss": 0.0004,
+ "step": 242
+ },
+ {
+ "epoch": 3.3317972350230414,
+ "grad_norm": 0.08258494734764099,
+ "learning_rate": 3.0398158592462847e-06,
+ "loss": 0.0005,
+ "step": 243
+ },
+ {
+ "epoch": 3.345622119815668,
+ "grad_norm": 0.01878846250474453,
+ "learning_rate": 3.0166935317123824e-06,
+ "loss": 0.0001,
+ "step": 244
+ },
+ {
+ "epoch": 3.359447004608295,
+ "grad_norm": 0.041918545961380005,
+ "learning_rate": 2.9935249390937184e-06,
+ "loss": 0.0002,
+ "step": 245
+ },
+ {
+ "epoch": 3.3732718894009217,
+ "grad_norm": 0.04018491134047508,
+ "learning_rate": 2.970312155921585e-06,
+ "loss": 0.0002,
+ "step": 246
+ },
+ {
+ "epoch": 3.3870967741935485,
+ "grad_norm": 0.040825020521879196,
+ "learning_rate": 2.9470572606841295e-06,
+ "loss": 0.0002,
+ "step": 247
+ },
+ {
+ "epoch": 3.4009216589861753,
+ "grad_norm": 0.050590481609106064,
+ "learning_rate": 2.9237623356402423e-06,
+ "loss": 0.0002,
+ "step": 248
+ },
+ {
+ "epoch": 3.4147465437788016,
+ "grad_norm": 0.07999978959560394,
+ "learning_rate": 2.900429466633107e-06,
+ "loss": 0.0002,
+ "step": 249
+ },
+ {
+ "epoch": 3.4285714285714284,
+ "grad_norm": 0.02137935161590576,
+ "learning_rate": 2.8770607429034352e-06,
+ "loss": 0.0001,
+ "step": 250
+ },
+ {
+ "epoch": 3.442396313364055,
+ "grad_norm": 0.18967340886592865,
+ "learning_rate": 2.8536582569023964e-06,
+ "loss": 0.0007,
+ "step": 251
+ },
+ {
+ "epoch": 3.456221198156682,
+ "grad_norm": 0.03681226819753647,
+ "learning_rate": 2.8302241041042564e-06,
+ "loss": 0.0001,
+ "step": 252
+ },
+ {
+ "epoch": 3.4700460829493087,
+ "grad_norm": 0.03142761439085007,
+ "learning_rate": 2.8067603828187446e-06,
+ "loss": 0.0001,
+ "step": 253
+ },
+ {
+ "epoch": 3.4838709677419355,
+ "grad_norm": 0.11318890005350113,
+ "learning_rate": 2.7832691940031755e-06,
+ "loss": 0.0005,
+ "step": 254
+ },
+ {
+ "epoch": 3.4976958525345623,
+ "grad_norm": 0.047176819294691086,
+ "learning_rate": 2.759752641074322e-06,
+ "loss": 0.0002,
+ "step": 255
+ },
+ {
+ "epoch": 3.511520737327189,
+ "grad_norm": 0.0642286092042923,
+ "learning_rate": 2.7362128297200784e-06,
+ "loss": 0.0002,
+ "step": 256
+ },
+ {
+ "epoch": 3.525345622119816,
+ "grad_norm": 0.09328105300664902,
+ "learning_rate": 2.712651867710914e-06,
+ "loss": 0.0004,
+ "step": 257
+ },
+ {
+ "epoch": 3.539170506912442,
+ "grad_norm": 0.08150269836187363,
+ "learning_rate": 2.6890718647111424e-06,
+ "loss": 0.0007,
+ "step": 258
+ },
+ {
+ "epoch": 3.5529953917050694,
+ "grad_norm": 0.03366294875741005,
+ "learning_rate": 2.665474932090017e-06,
+ "loss": 0.0001,
+ "step": 259
+ },
+ {
+ "epoch": 3.5668202764976957,
+ "grad_norm": 0.032316725701093674,
+ "learning_rate": 2.6418631827326857e-06,
+ "loss": 0.0001,
+ "step": 260
+ },
+ {
+ "epoch": 3.5806451612903225,
+ "grad_norm": 0.02776617370545864,
+ "learning_rate": 2.6182387308509927e-06,
+ "loss": 0.0001,
+ "step": 261
+ },
+ {
+ "epoch": 3.5944700460829493,
+ "grad_norm": 0.1258484572172165,
+ "learning_rate": 2.5946036917941765e-06,
+ "loss": 0.0003,
+ "step": 262
+ },
+ {
+ "epoch": 3.608294930875576,
+ "grad_norm": 0.04412033408880234,
+ "learning_rate": 2.570960181859458e-06,
+ "loss": 0.0003,
+ "step": 263
+ },
+ {
+ "epoch": 3.622119815668203,
+ "grad_norm": 0.016816483810544014,
+ "learning_rate": 2.547310318102548e-06,
+ "loss": 0.0001,
+ "step": 264
+ },
+ {
+ "epoch": 3.6359447004608296,
+ "grad_norm": 0.028503524139523506,
+ "learning_rate": 2.5236562181480794e-06,
+ "loss": 0.0001,
+ "step": 265
+ },
+ {
+ "epoch": 3.6497695852534564,
+ "grad_norm": 0.03991785645484924,
+ "learning_rate": 2.5e-06,
+ "loss": 0.0002,
+ "step": 266
+ },
+ {
+ "epoch": 3.6635944700460827,
+ "grad_norm": 0.07638856768608093,
+ "learning_rate": 2.4763437818519205e-06,
+ "loss": 0.0002,
+ "step": 267
+ },
+ {
+ "epoch": 3.6774193548387095,
+ "grad_norm": 0.032387226819992065,
+ "learning_rate": 2.4526896818974534e-06,
+ "loss": 0.0002,
+ "step": 268
+ },
+ {
+ "epoch": 3.6912442396313363,
+ "grad_norm": 0.035975128412246704,
+ "learning_rate": 2.429039818140543e-06,
+ "loss": 0.0002,
+ "step": 269
+ },
+ {
+ "epoch": 3.705069124423963,
+ "grad_norm": 0.021173926070332527,
+ "learning_rate": 2.405396308205825e-06,
+ "loss": 0.0001,
+ "step": 270
+ },
+ {
+ "epoch": 3.71889400921659,
+ "grad_norm": 0.005446314811706543,
+ "learning_rate": 2.381761269149009e-06,
+ "loss": 0.0,
+ "step": 271
+ },
+ {
+ "epoch": 3.7327188940092166,
+ "grad_norm": 0.04019308090209961,
+ "learning_rate": 2.358136817267315e-06,
+ "loss": 0.0001,
+ "step": 272
+ },
+ {
+ "epoch": 3.7465437788018434,
+ "grad_norm": 0.0222685057669878,
+ "learning_rate": 2.334525067909983e-06,
+ "loss": 0.0001,
+ "step": 273
+ },
+ {
+ "epoch": 3.76036866359447,
+ "grad_norm": 0.02486710622906685,
+ "learning_rate": 2.3109281352888593e-06,
+ "loss": 0.0001,
+ "step": 274
+ },
+ {
+ "epoch": 3.774193548387097,
+ "grad_norm": 0.01929207146167755,
+ "learning_rate": 2.2873481322890866e-06,
+ "loss": 0.0001,
+ "step": 275
+ },
+ {
+ "epoch": 3.7880184331797233,
+ "grad_norm": 0.010686581023037434,
+ "learning_rate": 2.263787170279922e-06,
+ "loss": 0.0,
+ "step": 276
+ },
+ {
+ "epoch": 3.80184331797235,
+ "grad_norm": 0.04710806906223297,
+ "learning_rate": 2.2402473589256793e-06,
+ "loss": 0.0001,
+ "step": 277
+ },
+ {
+ "epoch": 3.815668202764977,
+ "grad_norm": 0.00774085009470582,
+ "learning_rate": 2.2167308059968258e-06,
+ "loss": 0.0,
+ "step": 278
+ },
+ {
+ "epoch": 3.8294930875576036,
+ "grad_norm": 0.00735470512881875,
+ "learning_rate": 2.193239617181256e-06,
+ "loss": 0.0,
+ "step": 279
+ },
+ {
+ "epoch": 3.8433179723502304,
+ "grad_norm": 0.005572167690843344,
+ "learning_rate": 2.169775895895745e-06,
+ "loss": 0.0,
+ "step": 280
+ },
+ {
+ "epoch": 3.857142857142857,
+ "grad_norm": 0.07026448100805283,
+ "learning_rate": 2.146341743097604e-06,
+ "loss": 0.0004,
+ "step": 281
+ },
+ {
+ "epoch": 3.870967741935484,
+ "grad_norm": 0.03968067839741707,
+ "learning_rate": 2.1229392570965656e-06,
+ "loss": 0.0001,
+ "step": 282
+ },
+ {
+ "epoch": 3.8847926267281108,
+ "grad_norm": 0.002730958629399538,
+ "learning_rate": 2.0995705333668948e-06,
+ "loss": 0.0,
+ "step": 283
+ },
+ {
+ "epoch": 3.8986175115207375,
+ "grad_norm": 0.010703709907829762,
+ "learning_rate": 2.0762376643597586e-06,
+ "loss": 0.0,
+ "step": 284
+ },
+ {
+ "epoch": 3.912442396313364,
+ "grad_norm": 0.03527766093611717,
+ "learning_rate": 2.0529427393158704e-06,
+ "loss": 0.0001,
+ "step": 285
+ },
+ {
+ "epoch": 3.9262672811059907,
+ "grad_norm": 0.03926033526659012,
+ "learning_rate": 2.0296878440784164e-06,
+ "loss": 0.0001,
+ "step": 286
+ },
+ {
+ "epoch": 3.9400921658986174,
+ "grad_norm": 0.007335775997489691,
+ "learning_rate": 2.006475060906283e-06,
+ "loss": 0.0,
+ "step": 287
+ },
+ {
+ "epoch": 3.953917050691244,
+ "grad_norm": 0.005718631204217672,
+ "learning_rate": 1.9833064682876175e-06,
+ "loss": 0.0,
+ "step": 288
+ }
+ ],
+ "logging_steps": 1,
+ "max_steps": 432,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 6,
+ "save_steps": 72,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 1.8398904142396391e+19,
+ "train_batch_size": 4,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/checkpoint-288/training_args.bin b/checkpoint-288/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..48c70fd554062e31c1333fa196fcbd6a4f178c6c
--- /dev/null
+++ b/checkpoint-288/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d1d334ff15891d240486e54641859af8de96cab43a64ef7bf9dc417387365ae5
+size 7928
diff --git a/checkpoint-288/zero_to_fp32.py b/checkpoint-288/zero_to_fp32.py
new file mode 100644
index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8
--- /dev/null
+++ b/checkpoint-288/zero_to_fp32.py
@@ -0,0 +1,604 @@
+#!/usr/bin/env python
+
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example: python zero_to_fp32.py . pytorch_model.bin
+
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+from collections import OrderedDict
+from dataclasses import dataclass
+
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+ FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+ FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+
+
+@dataclass
+class zero_model_state:
+ buffers: dict()
+ param_shapes: dict()
+ shared_params: list
+ ds_version: int
+ frozen_param_shapes: dict()
+ frozen_param_fragments: dict()
+
+
+debug = 0
+
+# load to cpu
+device = torch.device('cpu')
+
+
+def atoi(text):
+ return int(text) if text.isdigit() else text
+
+
+def natural_keys(text):
+ '''
+ alist.sort(key=natural_keys) sorts in human order
+ http://nedbatchelder.com/blog/200712/human_sorting.html
+ (See Toothy's implementation in the comments)
+ '''
+ return [atoi(c) for c in re.split(r'(\d+)', text)]
+
+
+def get_model_state_file(checkpoint_dir, zero_stage):
+ if not os.path.isdir(checkpoint_dir):
+ raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+
+ # there should be only one file
+ if zero_stage <= 2:
+ file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+ elif zero_stage == 3:
+ file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+
+ if not os.path.exists(file):
+ raise FileNotFoundError(f"can't find model states file at '{file}'")
+
+ return file
+
+
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+ # XXX: need to test that this simple glob rule works for multi-node setup too
+ ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
+
+ if len(ckpt_files) == 0:
+ raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+
+ return ckpt_files
+
+
+def get_optim_files(checkpoint_dir):
+ return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
+
+
+def get_model_state_files(checkpoint_dir):
+ return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
+
+
+def parse_model_states(files):
+ zero_model_states = []
+ for file in files:
+ state_dict = torch.load(file, map_location=device)
+
+ if BUFFER_NAMES not in state_dict:
+ raise ValueError(f"{file} is not a model state checkpoint")
+ buffer_names = state_dict[BUFFER_NAMES]
+ if debug:
+ print("Found buffers:", buffer_names)
+
+ # recover just the buffers while restoring them to fp32 if they were saved in fp16
+ buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+ param_shapes = state_dict[PARAM_SHAPES]
+
+ # collect parameters that are included in param_shapes
+ param_names = []
+ for s in param_shapes:
+ for name in s.keys():
+ param_names.append(name)
+
+ # update with frozen parameters
+ frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+ if frozen_param_shapes is not None:
+ if debug:
+ print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+ param_names += list(frozen_param_shapes.keys())
+
+ # handle shared params
+ shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+
+ ds_version = state_dict.get(DS_VERSION, None)
+
+ frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+
+ z_model_state = zero_model_state(buffers=buffers,
+ param_shapes=param_shapes,
+ shared_params=shared_params,
+ ds_version=ds_version,
+ frozen_param_shapes=frozen_param_shapes,
+ frozen_param_fragments=frozen_param_fragments)
+ zero_model_states.append(z_model_state)
+
+ return zero_model_states
+
+
+def parse_optim_states(files, ds_checkpoint_dir):
+
+ total_files = len(files)
+ state_dicts = []
+ for f in files:
+ state_dict = torch.load(f, map_location=device)
+ # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
+ # and also handle the case where it was already removed by another helper script
+ state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
+ state_dicts.append(state_dict)
+
+ if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]:
+ raise ValueError(f"{files[0]} is not a zero checkpoint")
+ zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
+ world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
+
+ # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+ # parameters can be different from data parallelism for non-expert parameters. So we can just
+ # use the max of the partition_count to get the dp world_size.
+
+ if type(world_size) is list:
+ world_size = max(world_size)
+
+ if world_size != total_files:
+ raise ValueError(
+ f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+ "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+ )
+
+ # the groups are named differently in each stage
+ if zero_stage <= 2:
+ fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+ elif zero_stage == 3:
+ fp32_groups_key = FP32_FLAT_GROUPS
+ else:
+ raise ValueError(f"unknown zero stage {zero_stage}")
+
+ if zero_stage <= 2:
+ fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
+ elif zero_stage == 3:
+ # if there is more than one param group, there will be multiple flattened tensors - one
+ # flattened tensor per group - for simplicity merge them into a single tensor
+ #
+ # XXX: could make the script more memory efficient for when there are multiple groups - it
+ # will require matching the sub-lists of param_shapes for each param group flattened tensor
+
+ fp32_flat_groups = [
+ torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts))
+ ]
+
+ return zero_stage, world_size, fp32_flat_groups
+
+
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+ """
+ Returns fp32 state_dict reconstructed from ds checkpoint
+
+ Args:
+ - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+
+ """
+ print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+
+ optim_files = get_optim_files(ds_checkpoint_dir)
+ zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
+ print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+
+ model_files = get_model_state_files(ds_checkpoint_dir)
+
+ zero_model_states = parse_model_states(model_files)
+ print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+
+ if zero_stage <= 2:
+ return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters)
+ elif zero_stage == 3:
+ return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters)
+
+
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+ if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+ return
+
+ frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+ frozen_param_fragments = zero_model_states[0].frozen_param_fragments
+
+ if debug:
+ num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+ print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+ wanted_params = len(frozen_param_shapes)
+ wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+ avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+ print(f'Frozen params: Have {avail_numel} numels to process.')
+ print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+ total_params = 0
+ total_numel = 0
+ for name, shape in frozen_param_shapes.items():
+ total_params += 1
+ unpartitioned_numel = shape.numel()
+ total_numel += unpartitioned_numel
+
+ state_dict[name] = frozen_param_fragments[name]
+
+ if debug:
+ print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+
+ print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _has_callable(obj, fn):
+ attr = getattr(obj, fn, None)
+ return callable(attr)
+
+
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+ param_shapes = zero_model_states[0].param_shapes
+
+ # Reconstruction protocol:
+ #
+ # XXX: document this
+
+ if debug:
+ for i in range(world_size):
+ for j in range(len(fp32_flat_groups[0])):
+ print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+
+ # XXX: memory usage doubles here (zero2)
+ num_param_groups = len(fp32_flat_groups[0])
+ merged_single_partition_of_fp32_groups = []
+ for i in range(num_param_groups):
+ merged_partitions = [sd[i] for sd in fp32_flat_groups]
+ full_single_fp32_vector = torch.cat(merged_partitions, 0)
+ merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+ avail_numel = sum(
+ [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+
+ if debug:
+ wanted_params = sum([len(shapes) for shapes in param_shapes])
+ wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+ # not asserting if there is a mismatch due to possible padding
+ print(f"Have {avail_numel} numels to process.")
+ print(f"Need {wanted_numel} numels in {wanted_params} params.")
+
+ # params
+ # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+ # out-of-core computing solution
+ total_numel = 0
+ total_params = 0
+ for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+ offset = 0
+ avail_numel = full_single_fp32_vector.numel()
+ for name, shape in shapes.items():
+
+ unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+ total_numel += unpartitioned_numel
+ total_params += 1
+
+ if debug:
+ print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+ state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+ offset += unpartitioned_numel
+
+ # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+ # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+ # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+ # live optimizer object, so we are checking that the numbers are within the right range
+ align_to = 2 * world_size
+
+ def zero2_align(x):
+ return align_to * math.ceil(x / align_to)
+
+ if debug:
+ print(f"original offset={offset}, avail_numel={avail_numel}")
+
+ offset = zero2_align(offset)
+ avail_numel = zero2_align(avail_numel)
+
+ if debug:
+ print(f"aligned offset={offset}, avail_numel={avail_numel}")
+
+ # Sanity check
+ if offset != avail_numel:
+ raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+ print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters):
+ state_dict = OrderedDict()
+
+ # buffers
+ buffers = zero_model_states[0].buffers
+ state_dict.update(buffers)
+ if debug:
+ print(f"added {len(buffers)} buffers")
+
+ if not exclude_frozen_parameters:
+ _zero2_merge_frozen_params(state_dict, zero_model_states)
+
+ _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+ # recover shared parameters
+ for pair in zero_model_states[0].shared_params:
+ if pair[1] in state_dict:
+ state_dict[pair[0]] = state_dict[pair[1]]
+
+ return state_dict
+
+
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+ remainder = unpartitioned_numel % world_size
+ padding_numel = (world_size - remainder) if remainder else 0
+ partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+ return partitioned_numel, padding_numel
+
+
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+ if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+ return
+
+ if debug:
+ for i in range(world_size):
+ num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+ print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+ frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+ wanted_params = len(frozen_param_shapes)
+ wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+ avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+ print(f'Frozen params: Have {avail_numel} numels to process.')
+ print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+ total_params = 0
+ total_numel = 0
+ for name, shape in zero_model_states[0].frozen_param_shapes.items():
+ total_params += 1
+ unpartitioned_numel = shape.numel()
+ total_numel += unpartitioned_numel
+
+ param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+ state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+
+ partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+ if debug:
+ print(
+ f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+ )
+
+ print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+ param_shapes = zero_model_states[0].param_shapes
+ avail_numel = fp32_flat_groups[0].numel() * world_size
+ # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+ # param, re-consolidating each param, while dealing with padding if any
+
+ # merge list of dicts, preserving order
+ param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+
+ if debug:
+ for i in range(world_size):
+ print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+
+ wanted_params = len(param_shapes)
+ wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+ # not asserting if there is a mismatch due to possible padding
+ avail_numel = fp32_flat_groups[0].numel() * world_size
+ print(f"Trainable params: Have {avail_numel} numels to process.")
+ print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+
+ # params
+ # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+ # out-of-core computing solution
+ offset = 0
+ total_numel = 0
+ total_params = 0
+ for name, shape in param_shapes.items():
+
+ unpartitioned_numel = shape.numel()
+ total_numel += unpartitioned_numel
+ total_params += 1
+
+ partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+ if debug:
+ print(
+ f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+ )
+
+ # XXX: memory usage doubles here
+ state_dict[name] = torch.cat(
+ tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)),
+ 0).narrow(0, 0, unpartitioned_numel).view(shape)
+ offset += partitioned_numel
+
+ offset *= world_size
+
+ # Sanity check
+ if offset != avail_numel:
+ raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+ print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters):
+ state_dict = OrderedDict()
+
+ # buffers
+ buffers = zero_model_states[0].buffers
+ state_dict.update(buffers)
+ if debug:
+ print(f"added {len(buffers)} buffers")
+
+ if not exclude_frozen_parameters:
+ _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+
+ _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+ # recover shared parameters
+ for pair in zero_model_states[0].shared_params:
+ if pair[1] in state_dict:
+ state_dict[pair[0]] = state_dict[pair[1]]
+
+ return state_dict
+
+
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False):
+ """
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+ ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+ via a model hub.
+
+ Args:
+ - ``checkpoint_dir``: path to the desired checkpoint folder
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+ - ``exclude_frozen_parameters``: exclude frozen parameters
+
+ Returns:
+ - pytorch ``state_dict``
+
+ Note: this approach may not work if your application doesn't have sufficient free CPU memory and
+ you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+ the checkpoint.
+
+ A typical usage might be ::
+
+ from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+ # do the training and checkpoint saving
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+ model = model.cpu() # move to cpu
+ model.load_state_dict(state_dict)
+ # submit to model hub or save the model to share with others
+
+ In this example the ``model`` will no longer be usable in the deepspeed context of the same
+ application. i.e. you will need to re-initialize the deepspeed engine, since
+ ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+ If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+
+ """
+ if tag is None:
+ latest_path = os.path.join(checkpoint_dir, 'latest')
+ if os.path.isfile(latest_path):
+ with open(latest_path, 'r') as fd:
+ tag = fd.read().strip()
+ else:
+ raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+
+ ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+
+ if not os.path.isdir(ds_checkpoint_dir):
+ raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+
+ return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+
+
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False):
+ """
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
+ loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+
+ Args:
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+ - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+ - ``exclude_frozen_parameters``: exclude frozen parameters
+ """
+
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters)
+ print(f"Saving fp32 state dict to {output_file}")
+ torch.save(state_dict, output_file)
+
+
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+ """
+ 1. Put the provided model to cpu
+ 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+ 3. Load it into the provided model
+
+ Args:
+ - ``model``: the model object to update
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+
+ Returns:
+ - ``model`: modified model
+
+ Make sure you have plenty of CPU memory available before you call this function. If you don't
+ have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+ conveniently placed for you in the checkpoint folder.
+
+ A typical usage might be ::
+
+ from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+ model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+ # submit to model hub or save the model to share with others
+
+ Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+ of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+ ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+ """
+ logger.info(f"Extracting fp32 weights")
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+
+ logger.info(f"Overwriting model with fp32 weights")
+ model = model.cpu()
+ model.load_state_dict(state_dict, strict=False)
+
+ return model
+
+
+if __name__ == "__main__":
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument("checkpoint_dir",
+ type=str,
+ help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+ parser.add_argument(
+ "output_file",
+ type=str,
+ help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)")
+ parser.add_argument("-t",
+ "--tag",
+ type=str,
+ default=None,
+ help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+ parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+ parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+ args = parser.parse_args()
+
+ debug = args.debug
+
+ convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+ args.output_file,
+ tag=args.tag,
+ exclude_frozen_parameters=args.exclude_frozen_parameters)
diff --git a/checkpoint-360/README.md b/checkpoint-360/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..fdf619c317c2fe82074662582dbd68166b6f9d50
--- /dev/null
+++ b/checkpoint-360/README.md
@@ -0,0 +1,202 @@
+---
+base_model: nvidia/Llama-3.1-Nemotron-70B-Instruct-HF
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.15.0
\ No newline at end of file
diff --git a/checkpoint-360/adapter_config.json b/checkpoint-360/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..3abb5d68d20446d2b99ace226d6233a68590205a
--- /dev/null
+++ b/checkpoint-360/adapter_config.json
@@ -0,0 +1,42 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF",
+ "bias": "none",
+ "corda_config": null,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": null,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 512,
+ "lora_bias": false,
+ "lora_dropout": 0.05,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": [
+ "embed_tokens",
+ "lm_head"
+ ],
+ "peft_type": "LORA",
+ "r": 256,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "o_proj",
+ "up_proj",
+ "down_proj",
+ "gate_proj",
+ "v_proj",
+ "k_proj",
+ "q_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-360/adapter_model.safetensors b/checkpoint-360/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..3353bfd71ea6f28ab7363f800effd06eedf65735
--- /dev/null
+++ b/checkpoint-360/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a2b91de5ca2a27afeba8b4eb8274e7327bd8161f1994f6aa15ed1df3830b761b
+size 10829849744
diff --git a/checkpoint-360/global_step357/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-360/global_step357/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..1d970292de90c7ba0cd38fdd589fe57855a3266b
--- /dev/null
+++ b/checkpoint-360/global_step357/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b219557d37952b8b58e5cbeed4d802ff38504f90f2241c977ff0ad2680fc1c76
+size 21659418140
diff --git a/checkpoint-360/global_step357/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-360/global_step357/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..9afd2471a87d4669f8734b26ae1bbc35eba3123b
--- /dev/null
+++ b/checkpoint-360/global_step357/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8a63ba01318a3562d24cf51361e4b1420306a0e99ce67e9fa62558d32c07459e
+size 21659457372
diff --git a/checkpoint-360/global_step357/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-360/global_step357/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..894ab70f76a5d2a8550287c1d0ff671d569bda22
--- /dev/null
+++ b/checkpoint-360/global_step357/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fa674905c81e1df2773076e1daeb5acc92bfd1e6c02fa5d10cc2346f6e431a32
+size 21659417820
diff --git a/checkpoint-360/global_step357/mp_rank_00_model_states.pt b/checkpoint-360/global_step357/mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..c9658b21131394f3675079f2bd8c6153cf3afce5
--- /dev/null
+++ b/checkpoint-360/global_step357/mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:833866dc1ee58dea9693d551c47ea610791710e291ec4556a2830a696848e31d
+size 11918643933
diff --git a/checkpoint-360/latest b/checkpoint-360/latest
new file mode 100644
index 0000000000000000000000000000000000000000..82b5f7ef15b841f6e2bb6d67b0148b2cd3277795
--- /dev/null
+++ b/checkpoint-360/latest
@@ -0,0 +1 @@
+global_step357
\ No newline at end of file
diff --git a/checkpoint-360/rng_state_0.pth b/checkpoint-360/rng_state_0.pth
new file mode 100644
index 0000000000000000000000000000000000000000..0bc0ed10ed4931e058d17bfd0fb09d5722495759
--- /dev/null
+++ b/checkpoint-360/rng_state_0.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bff760ec5850731a18abe6b8c7e7f6a45c5dd541eaeb048d8066b987e042bcec
+size 14768
diff --git a/checkpoint-360/rng_state_1.pth b/checkpoint-360/rng_state_1.pth
new file mode 100644
index 0000000000000000000000000000000000000000..0ccb1c7c216b565574e135e61e9381c4b934bf31
--- /dev/null
+++ b/checkpoint-360/rng_state_1.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fa125a965cd3501fecf1885f01cd88d194e523f7182e0fe1710a680e091c3d6f
+size 14768
diff --git a/checkpoint-360/rng_state_2.pth b/checkpoint-360/rng_state_2.pth
new file mode 100644
index 0000000000000000000000000000000000000000..54197787343cb5934778d904643b3f1ecb04e999
--- /dev/null
+++ b/checkpoint-360/rng_state_2.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3e47d86fd08c700a35d1a6c5a138a7c3c26edb31ccc6a09cacdb3d44cc3d2640
+size 14768
diff --git a/checkpoint-360/scheduler.pt b/checkpoint-360/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..a9bea00971e15c82f3fef418b45f5772efc6e041
--- /dev/null
+++ b/checkpoint-360/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f12fb5515df7dcf4cc1a3ecda848613d64db62cd311a9066dc53c01e4e1c6a83
+size 1064
diff --git a/checkpoint-360/special_tokens_map.json b/checkpoint-360/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..278b7f0f84be865c4687700ee7b3c63d89a51e18
--- /dev/null
+++ b/checkpoint-360/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+ "bos_token": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "<|eot_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/checkpoint-360/tokenizer.json b/checkpoint-360/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2
--- /dev/null
+++ b/checkpoint-360/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
+size 17209920
diff --git a/checkpoint-360/tokenizer_config.json b/checkpoint-360/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..fdde94c29816839ec3c29d6c6461206a49911f3c
--- /dev/null
+++ b/checkpoint-360/tokenizer_config.json
@@ -0,0 +1,2064 @@
+{
+ "added_tokens_decoder": {
+ "128000": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128001": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128002": {
+ "content": "<|reserved_special_token_0|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128003": {
+ "content": "<|reserved_special_token_1|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128004": {
+ "content": "<|finetune_right_pad_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128005": {
+ "content": "<|reserved_special_token_2|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128006": {
+ "content": "<|start_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128007": {
+ "content": "<|end_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128008": {
+ "content": "<|eom_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128009": {
+ "content": "<|eot_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128010": {
+ "content": "<|python_tag|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128011": {
+ "content": "<|reserved_special_token_3|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128012": {
+ "content": "<|reserved_special_token_4|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128013": {
+ "content": "<|reserved_special_token_5|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128014": {
+ "content": "<|reserved_special_token_6|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128015": {
+ "content": "<|reserved_special_token_7|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128016": {
+ "content": "<|reserved_special_token_8|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128017": {
+ "content": "<|reserved_special_token_9|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128018": {
+ "content": "<|reserved_special_token_10|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128019": {
+ "content": "<|reserved_special_token_11|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128020": {
+ "content": "<|reserved_special_token_12|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128021": {
+ "content": "<|reserved_special_token_13|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128022": {
+ "content": "<|reserved_special_token_14|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128023": {
+ "content": "<|reserved_special_token_15|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128024": {
+ "content": "<|reserved_special_token_16|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128025": {
+ "content": "<|reserved_special_token_17|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128026": {
+ "content": "<|reserved_special_token_18|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128027": {
+ "content": "<|reserved_special_token_19|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128028": {
+ "content": "<|reserved_special_token_20|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128029": {
+ "content": "<|reserved_special_token_21|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128030": {
+ "content": "<|reserved_special_token_22|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128031": {
+ "content": "<|reserved_special_token_23|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128032": {
+ "content": "<|reserved_special_token_24|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128033": {
+ "content": "<|reserved_special_token_25|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128034": {
+ "content": "<|reserved_special_token_26|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128035": {
+ "content": "<|reserved_special_token_27|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128036": {
+ "content": "<|reserved_special_token_28|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128037": {
+ "content": "<|reserved_special_token_29|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128038": {
+ "content": "<|reserved_special_token_30|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128039": {
+ "content": "<|reserved_special_token_31|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128040": {
+ "content": "<|reserved_special_token_32|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128041": {
+ "content": "<|reserved_special_token_33|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128042": {
+ "content": "<|reserved_special_token_34|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128043": {
+ "content": "<|reserved_special_token_35|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128044": {
+ "content": "<|reserved_special_token_36|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128045": {
+ "content": "<|reserved_special_token_37|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128046": {
+ "content": "<|reserved_special_token_38|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128047": {
+ "content": "<|reserved_special_token_39|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128048": {
+ "content": "<|reserved_special_token_40|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128049": {
+ "content": "<|reserved_special_token_41|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128050": {
+ "content": "<|reserved_special_token_42|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128051": {
+ "content": "<|reserved_special_token_43|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128052": {
+ "content": "<|reserved_special_token_44|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128053": {
+ "content": "<|reserved_special_token_45|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128054": {
+ "content": "<|reserved_special_token_46|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128055": {
+ "content": "<|reserved_special_token_47|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128056": {
+ "content": "<|reserved_special_token_48|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128057": {
+ "content": "<|reserved_special_token_49|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128058": {
+ "content": "<|reserved_special_token_50|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128059": {
+ "content": "<|reserved_special_token_51|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128060": {
+ "content": "<|reserved_special_token_52|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128061": {
+ "content": "<|reserved_special_token_53|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128062": {
+ "content": "<|reserved_special_token_54|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128063": {
+ "content": "<|reserved_special_token_55|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128064": {
+ "content": "<|reserved_special_token_56|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128065": {
+ "content": "<|reserved_special_token_57|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128066": {
+ "content": "<|reserved_special_token_58|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128067": {
+ "content": "<|reserved_special_token_59|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128068": {
+ "content": "<|reserved_special_token_60|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128069": {
+ "content": "<|reserved_special_token_61|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128070": {
+ "content": "<|reserved_special_token_62|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128071": {
+ "content": "<|reserved_special_token_63|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128072": {
+ "content": "<|reserved_special_token_64|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128073": {
+ "content": "<|reserved_special_token_65|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128074": {
+ "content": "<|reserved_special_token_66|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128075": {
+ "content": "<|reserved_special_token_67|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128076": {
+ "content": "<|reserved_special_token_68|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128077": {
+ "content": "<|reserved_special_token_69|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128078": {
+ "content": "<|reserved_special_token_70|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128079": {
+ "content": "<|reserved_special_token_71|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128080": {
+ "content": "<|reserved_special_token_72|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128081": {
+ "content": "<|reserved_special_token_73|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128082": {
+ "content": "<|reserved_special_token_74|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128083": {
+ "content": "<|reserved_special_token_75|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128084": {
+ "content": "<|reserved_special_token_76|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128085": {
+ "content": "<|reserved_special_token_77|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128086": {
+ "content": "<|reserved_special_token_78|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128087": {
+ "content": "<|reserved_special_token_79|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128088": {
+ "content": "<|reserved_special_token_80|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128089": {
+ "content": "<|reserved_special_token_81|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128090": {
+ "content": "<|reserved_special_token_82|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128091": {
+ "content": "<|reserved_special_token_83|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128092": {
+ "content": "<|reserved_special_token_84|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128093": {
+ "content": "<|reserved_special_token_85|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128094": {
+ "content": "<|reserved_special_token_86|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128095": {
+ "content": "<|reserved_special_token_87|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128096": {
+ "content": "<|reserved_special_token_88|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128097": {
+ "content": "<|reserved_special_token_89|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128098": {
+ "content": "<|reserved_special_token_90|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128099": {
+ "content": "<|reserved_special_token_91|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128100": {
+ "content": "<|reserved_special_token_92|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128101": {
+ "content": "<|reserved_special_token_93|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128102": {
+ "content": "<|reserved_special_token_94|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128103": {
+ "content": "<|reserved_special_token_95|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128104": {
+ "content": "<|reserved_special_token_96|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128105": {
+ "content": "<|reserved_special_token_97|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128106": {
+ "content": "<|reserved_special_token_98|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128107": {
+ "content": "<|reserved_special_token_99|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128108": {
+ "content": "<|reserved_special_token_100|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128109": {
+ "content": "<|reserved_special_token_101|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128110": {
+ "content": "<|reserved_special_token_102|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128111": {
+ "content": "<|reserved_special_token_103|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128112": {
+ "content": "<|reserved_special_token_104|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128113": {
+ "content": "<|reserved_special_token_105|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128114": {
+ "content": "<|reserved_special_token_106|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128115": {
+ "content": "<|reserved_special_token_107|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128116": {
+ "content": "<|reserved_special_token_108|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128117": {
+ "content": "<|reserved_special_token_109|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128118": {
+ "content": "<|reserved_special_token_110|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128119": {
+ "content": "<|reserved_special_token_111|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128120": {
+ "content": "<|reserved_special_token_112|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128121": {
+ "content": "<|reserved_special_token_113|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128122": {
+ "content": "<|reserved_special_token_114|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128123": {
+ "content": "<|reserved_special_token_115|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128124": {
+ "content": "<|reserved_special_token_116|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128125": {
+ "content": "<|reserved_special_token_117|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128126": {
+ "content": "<|reserved_special_token_118|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128127": {
+ "content": "<|reserved_special_token_119|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128128": {
+ "content": "<|reserved_special_token_120|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128129": {
+ "content": "<|reserved_special_token_121|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128130": {
+ "content": "<|reserved_special_token_122|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128131": {
+ "content": "<|reserved_special_token_123|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128132": {
+ "content": "<|reserved_special_token_124|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128133": {
+ "content": "<|reserved_special_token_125|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128134": {
+ "content": "<|reserved_special_token_126|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128135": {
+ "content": "<|reserved_special_token_127|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128136": {
+ "content": "<|reserved_special_token_128|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128137": {
+ "content": "<|reserved_special_token_129|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128138": {
+ "content": "<|reserved_special_token_130|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128139": {
+ "content": "<|reserved_special_token_131|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128140": {
+ "content": "<|reserved_special_token_132|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128141": {
+ "content": "<|reserved_special_token_133|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128142": {
+ "content": "<|reserved_special_token_134|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128143": {
+ "content": "<|reserved_special_token_135|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128144": {
+ "content": "<|reserved_special_token_136|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128145": {
+ "content": "<|reserved_special_token_137|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128146": {
+ "content": "<|reserved_special_token_138|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128147": {
+ "content": "<|reserved_special_token_139|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128148": {
+ "content": "<|reserved_special_token_140|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128149": {
+ "content": "<|reserved_special_token_141|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128150": {
+ "content": "<|reserved_special_token_142|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128151": {
+ "content": "<|reserved_special_token_143|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128152": {
+ "content": "<|reserved_special_token_144|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128153": {
+ "content": "<|reserved_special_token_145|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128154": {
+ "content": "<|reserved_special_token_146|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128155": {
+ "content": "<|reserved_special_token_147|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128156": {
+ "content": "<|reserved_special_token_148|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128157": {
+ "content": "<|reserved_special_token_149|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128158": {
+ "content": "<|reserved_special_token_150|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128159": {
+ "content": "<|reserved_special_token_151|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128160": {
+ "content": "<|reserved_special_token_152|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128161": {
+ "content": "<|reserved_special_token_153|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128162": {
+ "content": "<|reserved_special_token_154|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128163": {
+ "content": "<|reserved_special_token_155|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128164": {
+ "content": "<|reserved_special_token_156|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128165": {
+ "content": "<|reserved_special_token_157|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128166": {
+ "content": "<|reserved_special_token_158|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128167": {
+ "content": "<|reserved_special_token_159|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128168": {
+ "content": "<|reserved_special_token_160|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128169": {
+ "content": "<|reserved_special_token_161|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128170": {
+ "content": "<|reserved_special_token_162|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128171": {
+ "content": "<|reserved_special_token_163|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128172": {
+ "content": "<|reserved_special_token_164|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128173": {
+ "content": "<|reserved_special_token_165|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128174": {
+ "content": "<|reserved_special_token_166|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128175": {
+ "content": "<|reserved_special_token_167|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128176": {
+ "content": "<|reserved_special_token_168|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128177": {
+ "content": "<|reserved_special_token_169|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128178": {
+ "content": "<|reserved_special_token_170|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128179": {
+ "content": "<|reserved_special_token_171|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128180": {
+ "content": "<|reserved_special_token_172|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128181": {
+ "content": "<|reserved_special_token_173|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128182": {
+ "content": "<|reserved_special_token_174|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128183": {
+ "content": "<|reserved_special_token_175|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128184": {
+ "content": "<|reserved_special_token_176|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128185": {
+ "content": "<|reserved_special_token_177|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128186": {
+ "content": "<|reserved_special_token_178|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128187": {
+ "content": "<|reserved_special_token_179|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128188": {
+ "content": "<|reserved_special_token_180|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128189": {
+ "content": "<|reserved_special_token_181|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128190": {
+ "content": "<|reserved_special_token_182|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128191": {
+ "content": "<|reserved_special_token_183|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128192": {
+ "content": "<|reserved_special_token_184|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128193": {
+ "content": "<|reserved_special_token_185|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128194": {
+ "content": "<|reserved_special_token_186|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128195": {
+ "content": "<|reserved_special_token_187|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128196": {
+ "content": "<|reserved_special_token_188|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128197": {
+ "content": "<|reserved_special_token_189|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128198": {
+ "content": "<|reserved_special_token_190|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128199": {
+ "content": "<|reserved_special_token_191|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128200": {
+ "content": "<|reserved_special_token_192|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128201": {
+ "content": "<|reserved_special_token_193|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128202": {
+ "content": "<|reserved_special_token_194|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128203": {
+ "content": "<|reserved_special_token_195|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128204": {
+ "content": "<|reserved_special_token_196|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128205": {
+ "content": "<|reserved_special_token_197|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128206": {
+ "content": "<|reserved_special_token_198|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128207": {
+ "content": "<|reserved_special_token_199|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128208": {
+ "content": "<|reserved_special_token_200|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128209": {
+ "content": "<|reserved_special_token_201|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128210": {
+ "content": "<|reserved_special_token_202|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128211": {
+ "content": "<|reserved_special_token_203|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128212": {
+ "content": "<|reserved_special_token_204|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128213": {
+ "content": "<|reserved_special_token_205|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128214": {
+ "content": "<|reserved_special_token_206|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128215": {
+ "content": "<|reserved_special_token_207|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128216": {
+ "content": "<|reserved_special_token_208|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128217": {
+ "content": "<|reserved_special_token_209|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128218": {
+ "content": "<|reserved_special_token_210|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128219": {
+ "content": "<|reserved_special_token_211|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128220": {
+ "content": "<|reserved_special_token_212|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128221": {
+ "content": "<|reserved_special_token_213|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128222": {
+ "content": "<|reserved_special_token_214|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128223": {
+ "content": "<|reserved_special_token_215|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128224": {
+ "content": "<|reserved_special_token_216|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128225": {
+ "content": "<|reserved_special_token_217|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128226": {
+ "content": "<|reserved_special_token_218|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128227": {
+ "content": "<|reserved_special_token_219|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128228": {
+ "content": "<|reserved_special_token_220|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128229": {
+ "content": "<|reserved_special_token_221|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128230": {
+ "content": "<|reserved_special_token_222|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128231": {
+ "content": "<|reserved_special_token_223|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128232": {
+ "content": "<|reserved_special_token_224|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128233": {
+ "content": "<|reserved_special_token_225|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128234": {
+ "content": "<|reserved_special_token_226|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128235": {
+ "content": "<|reserved_special_token_227|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128236": {
+ "content": "<|reserved_special_token_228|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128237": {
+ "content": "<|reserved_special_token_229|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128238": {
+ "content": "<|reserved_special_token_230|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128239": {
+ "content": "<|reserved_special_token_231|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128240": {
+ "content": "<|reserved_special_token_232|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128241": {
+ "content": "<|reserved_special_token_233|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128242": {
+ "content": "<|reserved_special_token_234|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128243": {
+ "content": "<|reserved_special_token_235|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128244": {
+ "content": "<|reserved_special_token_236|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128245": {
+ "content": "<|reserved_special_token_237|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128246": {
+ "content": "<|reserved_special_token_238|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128247": {
+ "content": "<|reserved_special_token_239|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128248": {
+ "content": "<|reserved_special_token_240|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128249": {
+ "content": "<|reserved_special_token_241|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128250": {
+ "content": "<|reserved_special_token_242|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128251": {
+ "content": "<|reserved_special_token_243|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128252": {
+ "content": "<|reserved_special_token_244|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128253": {
+ "content": "<|reserved_special_token_245|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128254": {
+ "content": "<|reserved_special_token_246|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128255": {
+ "content": "<|reserved_special_token_247|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<|begin_of_text|>",
+ "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n",
+ "clean_up_tokenization_spaces": true,
+ "eos_token": "<|eot_id|>",
+ "extra_special_tokens": {},
+ "model_input_names": [
+ "input_ids",
+ "attention_mask"
+ ],
+ "model_max_length": 131072,
+ "pad_token": "<|end_of_text|>",
+ "tokenizer_class": "PreTrainedTokenizer"
+}
diff --git a/checkpoint-360/trainer_state.json b/checkpoint-360/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..ab6b2b14d098630b9f236478485bc800d3db768f
--- /dev/null
+++ b/checkpoint-360/trainer_state.json
@@ -0,0 +1,2553 @@
+{
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 4.940092165898617,
+ "eval_steps": 500,
+ "global_step": 360,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.013824884792626729,
+ "grad_norm": 34.963134765625,
+ "learning_rate": 5.0000000000000004e-08,
+ "loss": 2.5476,
+ "step": 1
+ },
+ {
+ "epoch": 0.027649769585253458,
+ "grad_norm": 35.32600021362305,
+ "learning_rate": 1.0000000000000001e-07,
+ "loss": 2.6058,
+ "step": 2
+ },
+ {
+ "epoch": 0.041474654377880185,
+ "grad_norm": 34.955448150634766,
+ "learning_rate": 1.5000000000000002e-07,
+ "loss": 2.5871,
+ "step": 3
+ },
+ {
+ "epoch": 0.055299539170506916,
+ "grad_norm": 35.09806442260742,
+ "learning_rate": 2.0000000000000002e-07,
+ "loss": 2.5912,
+ "step": 4
+ },
+ {
+ "epoch": 0.06912442396313365,
+ "grad_norm": 34.88739776611328,
+ "learning_rate": 2.5000000000000004e-07,
+ "loss": 2.592,
+ "step": 5
+ },
+ {
+ "epoch": 0.08294930875576037,
+ "grad_norm": 34.84288024902344,
+ "learning_rate": 3.0000000000000004e-07,
+ "loss": 2.5609,
+ "step": 6
+ },
+ {
+ "epoch": 0.0967741935483871,
+ "grad_norm": 35.0090217590332,
+ "learning_rate": 3.5000000000000004e-07,
+ "loss": 2.5651,
+ "step": 7
+ },
+ {
+ "epoch": 0.11059907834101383,
+ "grad_norm": 35.03983688354492,
+ "learning_rate": 4.0000000000000003e-07,
+ "loss": 2.5437,
+ "step": 8
+ },
+ {
+ "epoch": 0.12442396313364056,
+ "grad_norm": 34.802833557128906,
+ "learning_rate": 4.5000000000000003e-07,
+ "loss": 2.5448,
+ "step": 9
+ },
+ {
+ "epoch": 0.1382488479262673,
+ "grad_norm": 34.5220947265625,
+ "learning_rate": 5.000000000000001e-07,
+ "loss": 2.504,
+ "step": 10
+ },
+ {
+ "epoch": 0.15207373271889402,
+ "grad_norm": 34.401580810546875,
+ "learning_rate": 5.5e-07,
+ "loss": 2.4814,
+ "step": 11
+ },
+ {
+ "epoch": 0.16589861751152074,
+ "grad_norm": 33.76997375488281,
+ "learning_rate": 6.000000000000001e-07,
+ "loss": 2.4282,
+ "step": 12
+ },
+ {
+ "epoch": 0.17972350230414746,
+ "grad_norm": 33.53415298461914,
+ "learning_rate": 6.5e-07,
+ "loss": 2.4216,
+ "step": 13
+ },
+ {
+ "epoch": 0.1935483870967742,
+ "grad_norm": 32.401580810546875,
+ "learning_rate": 7.000000000000001e-07,
+ "loss": 2.3362,
+ "step": 14
+ },
+ {
+ "epoch": 0.2073732718894009,
+ "grad_norm": 33.636661529541016,
+ "learning_rate": 7.5e-07,
+ "loss": 2.2978,
+ "step": 15
+ },
+ {
+ "epoch": 0.22119815668202766,
+ "grad_norm": 31.3782901763916,
+ "learning_rate": 8.000000000000001e-07,
+ "loss": 2.1358,
+ "step": 16
+ },
+ {
+ "epoch": 0.2350230414746544,
+ "grad_norm": 30.72391700744629,
+ "learning_rate": 8.500000000000001e-07,
+ "loss": 2.0652,
+ "step": 17
+ },
+ {
+ "epoch": 0.2488479262672811,
+ "grad_norm": 30.817584991455078,
+ "learning_rate": 9.000000000000001e-07,
+ "loss": 2.0115,
+ "step": 18
+ },
+ {
+ "epoch": 0.2626728110599078,
+ "grad_norm": 29.683996200561523,
+ "learning_rate": 9.500000000000001e-07,
+ "loss": 1.8668,
+ "step": 19
+ },
+ {
+ "epoch": 0.2764976958525346,
+ "grad_norm": 29.506683349609375,
+ "learning_rate": 1.0000000000000002e-06,
+ "loss": 1.7796,
+ "step": 20
+ },
+ {
+ "epoch": 0.2903225806451613,
+ "grad_norm": 27.55340003967285,
+ "learning_rate": 1.0500000000000001e-06,
+ "loss": 1.5656,
+ "step": 21
+ },
+ {
+ "epoch": 0.30414746543778803,
+ "grad_norm": 27.78036880493164,
+ "learning_rate": 1.1e-06,
+ "loss": 1.5112,
+ "step": 22
+ },
+ {
+ "epoch": 0.31797235023041476,
+ "grad_norm": 26.36115264892578,
+ "learning_rate": 1.1500000000000002e-06,
+ "loss": 1.3283,
+ "step": 23
+ },
+ {
+ "epoch": 0.3317972350230415,
+ "grad_norm": 25.388761520385742,
+ "learning_rate": 1.2000000000000002e-06,
+ "loss": 1.137,
+ "step": 24
+ },
+ {
+ "epoch": 0.3456221198156682,
+ "grad_norm": 25.21432876586914,
+ "learning_rate": 1.25e-06,
+ "loss": 0.9867,
+ "step": 25
+ },
+ {
+ "epoch": 0.35944700460829493,
+ "grad_norm": 24.924489974975586,
+ "learning_rate": 1.3e-06,
+ "loss": 0.7122,
+ "step": 26
+ },
+ {
+ "epoch": 0.37327188940092165,
+ "grad_norm": 21.881420135498047,
+ "learning_rate": 1.3500000000000002e-06,
+ "loss": 0.4952,
+ "step": 27
+ },
+ {
+ "epoch": 0.3870967741935484,
+ "grad_norm": 17.67154884338379,
+ "learning_rate": 1.4000000000000001e-06,
+ "loss": 0.3602,
+ "step": 28
+ },
+ {
+ "epoch": 0.4009216589861751,
+ "grad_norm": 11.489490509033203,
+ "learning_rate": 1.45e-06,
+ "loss": 0.2432,
+ "step": 29
+ },
+ {
+ "epoch": 0.4147465437788018,
+ "grad_norm": 7.622438907623291,
+ "learning_rate": 1.5e-06,
+ "loss": 0.189,
+ "step": 30
+ },
+ {
+ "epoch": 0.42857142857142855,
+ "grad_norm": 4.340638637542725,
+ "learning_rate": 1.5500000000000002e-06,
+ "loss": 0.1302,
+ "step": 31
+ },
+ {
+ "epoch": 0.4423963133640553,
+ "grad_norm": 3.079514980316162,
+ "learning_rate": 1.6000000000000001e-06,
+ "loss": 0.1075,
+ "step": 32
+ },
+ {
+ "epoch": 0.45622119815668205,
+ "grad_norm": 2.355943441390991,
+ "learning_rate": 1.6500000000000003e-06,
+ "loss": 0.0998,
+ "step": 33
+ },
+ {
+ "epoch": 0.4700460829493088,
+ "grad_norm": 1.9480725526809692,
+ "learning_rate": 1.7000000000000002e-06,
+ "loss": 0.0926,
+ "step": 34
+ },
+ {
+ "epoch": 0.4838709677419355,
+ "grad_norm": 1.8598166704177856,
+ "learning_rate": 1.75e-06,
+ "loss": 0.0733,
+ "step": 35
+ },
+ {
+ "epoch": 0.4976958525345622,
+ "grad_norm": 0.9892730712890625,
+ "learning_rate": 1.8000000000000001e-06,
+ "loss": 0.0664,
+ "step": 36
+ },
+ {
+ "epoch": 0.511520737327189,
+ "grad_norm": 0.8992418050765991,
+ "learning_rate": 1.85e-06,
+ "loss": 0.0709,
+ "step": 37
+ },
+ {
+ "epoch": 0.5253456221198156,
+ "grad_norm": 0.7340101599693298,
+ "learning_rate": 1.9000000000000002e-06,
+ "loss": 0.0535,
+ "step": 38
+ },
+ {
+ "epoch": 0.5391705069124424,
+ "grad_norm": 0.7032178044319153,
+ "learning_rate": 1.9500000000000004e-06,
+ "loss": 0.0573,
+ "step": 39
+ },
+ {
+ "epoch": 0.5529953917050692,
+ "grad_norm": 0.6449429392814636,
+ "learning_rate": 2.0000000000000003e-06,
+ "loss": 0.0576,
+ "step": 40
+ },
+ {
+ "epoch": 0.5668202764976958,
+ "grad_norm": 0.6358592510223389,
+ "learning_rate": 2.05e-06,
+ "loss": 0.0502,
+ "step": 41
+ },
+ {
+ "epoch": 0.5806451612903226,
+ "grad_norm": 0.572036623954773,
+ "learning_rate": 2.1000000000000002e-06,
+ "loss": 0.0556,
+ "step": 42
+ },
+ {
+ "epoch": 0.5944700460829493,
+ "grad_norm": 0.6538863778114319,
+ "learning_rate": 2.15e-06,
+ "loss": 0.0556,
+ "step": 43
+ },
+ {
+ "epoch": 0.6082949308755761,
+ "grad_norm": 0.3532159626483917,
+ "learning_rate": 2.2e-06,
+ "loss": 0.0452,
+ "step": 44
+ },
+ {
+ "epoch": 0.6221198156682027,
+ "grad_norm": 0.4853012263774872,
+ "learning_rate": 2.25e-06,
+ "loss": 0.0471,
+ "step": 45
+ },
+ {
+ "epoch": 0.6359447004608295,
+ "grad_norm": 0.4761648178100586,
+ "learning_rate": 2.3000000000000004e-06,
+ "loss": 0.0469,
+ "step": 46
+ },
+ {
+ "epoch": 0.6497695852534562,
+ "grad_norm": 0.6094638109207153,
+ "learning_rate": 2.35e-06,
+ "loss": 0.047,
+ "step": 47
+ },
+ {
+ "epoch": 0.663594470046083,
+ "grad_norm": 0.5211306214332581,
+ "learning_rate": 2.4000000000000003e-06,
+ "loss": 0.0402,
+ "step": 48
+ },
+ {
+ "epoch": 0.6774193548387096,
+ "grad_norm": 0.2997778356075287,
+ "learning_rate": 2.4500000000000003e-06,
+ "loss": 0.0425,
+ "step": 49
+ },
+ {
+ "epoch": 0.6912442396313364,
+ "grad_norm": 0.37834689021110535,
+ "learning_rate": 2.5e-06,
+ "loss": 0.0446,
+ "step": 50
+ },
+ {
+ "epoch": 0.7050691244239631,
+ "grad_norm": 0.31011995673179626,
+ "learning_rate": 2.55e-06,
+ "loss": 0.0406,
+ "step": 51
+ },
+ {
+ "epoch": 0.7188940092165899,
+ "grad_norm": 0.3113131523132324,
+ "learning_rate": 2.6e-06,
+ "loss": 0.0368,
+ "step": 52
+ },
+ {
+ "epoch": 0.7327188940092166,
+ "grad_norm": 0.5685846209526062,
+ "learning_rate": 2.6500000000000005e-06,
+ "loss": 0.0389,
+ "step": 53
+ },
+ {
+ "epoch": 0.7465437788018433,
+ "grad_norm": 0.29334983229637146,
+ "learning_rate": 2.7000000000000004e-06,
+ "loss": 0.0423,
+ "step": 54
+ },
+ {
+ "epoch": 0.7603686635944701,
+ "grad_norm": 0.5776861906051636,
+ "learning_rate": 2.7500000000000004e-06,
+ "loss": 0.0399,
+ "step": 55
+ },
+ {
+ "epoch": 0.7741935483870968,
+ "grad_norm": 0.35423165559768677,
+ "learning_rate": 2.8000000000000003e-06,
+ "loss": 0.0357,
+ "step": 56
+ },
+ {
+ "epoch": 0.7880184331797235,
+ "grad_norm": 0.37902742624282837,
+ "learning_rate": 2.85e-06,
+ "loss": 0.0407,
+ "step": 57
+ },
+ {
+ "epoch": 0.8018433179723502,
+ "grad_norm": 0.26948878169059753,
+ "learning_rate": 2.9e-06,
+ "loss": 0.0351,
+ "step": 58
+ },
+ {
+ "epoch": 0.815668202764977,
+ "grad_norm": 0.35688117146492004,
+ "learning_rate": 2.95e-06,
+ "loss": 0.0377,
+ "step": 59
+ },
+ {
+ "epoch": 0.8294930875576036,
+ "grad_norm": 0.5287911891937256,
+ "learning_rate": 3e-06,
+ "loss": 0.0377,
+ "step": 60
+ },
+ {
+ "epoch": 0.8433179723502304,
+ "grad_norm": 0.2950785756111145,
+ "learning_rate": 3.05e-06,
+ "loss": 0.0361,
+ "step": 61
+ },
+ {
+ "epoch": 0.8571428571428571,
+ "grad_norm": 0.2789723575115204,
+ "learning_rate": 3.1000000000000004e-06,
+ "loss": 0.032,
+ "step": 62
+ },
+ {
+ "epoch": 0.8709677419354839,
+ "grad_norm": 0.2802198529243469,
+ "learning_rate": 3.1500000000000003e-06,
+ "loss": 0.0394,
+ "step": 63
+ },
+ {
+ "epoch": 0.8847926267281107,
+ "grad_norm": 0.286981463432312,
+ "learning_rate": 3.2000000000000003e-06,
+ "loss": 0.033,
+ "step": 64
+ },
+ {
+ "epoch": 0.8986175115207373,
+ "grad_norm": 0.37392762303352356,
+ "learning_rate": 3.2500000000000002e-06,
+ "loss": 0.0335,
+ "step": 65
+ },
+ {
+ "epoch": 0.9124423963133641,
+ "grad_norm": 0.25025588274002075,
+ "learning_rate": 3.3000000000000006e-06,
+ "loss": 0.0311,
+ "step": 66
+ },
+ {
+ "epoch": 0.9262672811059908,
+ "grad_norm": 0.4292861521244049,
+ "learning_rate": 3.3500000000000005e-06,
+ "loss": 0.0362,
+ "step": 67
+ },
+ {
+ "epoch": 0.9400921658986175,
+ "grad_norm": 0.4717651307582855,
+ "learning_rate": 3.4000000000000005e-06,
+ "loss": 0.0303,
+ "step": 68
+ },
+ {
+ "epoch": 0.9539170506912442,
+ "grad_norm": 0.49291253089904785,
+ "learning_rate": 3.45e-06,
+ "loss": 0.0352,
+ "step": 69
+ },
+ {
+ "epoch": 0.967741935483871,
+ "grad_norm": 0.3729935586452484,
+ "learning_rate": 3.5e-06,
+ "loss": 0.0297,
+ "step": 70
+ },
+ {
+ "epoch": 0.9815668202764977,
+ "grad_norm": 0.27150583267211914,
+ "learning_rate": 3.5500000000000003e-06,
+ "loss": 0.0326,
+ "step": 71
+ },
+ {
+ "epoch": 0.9953917050691244,
+ "grad_norm": 0.34516096115112305,
+ "learning_rate": 3.6000000000000003e-06,
+ "loss": 0.0336,
+ "step": 72
+ },
+ {
+ "epoch": 1.0,
+ "grad_norm": 0.34516096115112305,
+ "learning_rate": 3.65e-06,
+ "loss": 0.0274,
+ "step": 73
+ },
+ {
+ "epoch": 1.0138248847926268,
+ "grad_norm": 0.6282734870910645,
+ "learning_rate": 3.7e-06,
+ "loss": 0.0289,
+ "step": 74
+ },
+ {
+ "epoch": 1.0276497695852536,
+ "grad_norm": 0.2935558557510376,
+ "learning_rate": 3.7500000000000005e-06,
+ "loss": 0.0308,
+ "step": 75
+ },
+ {
+ "epoch": 1.0414746543778801,
+ "grad_norm": 0.3166769742965698,
+ "learning_rate": 3.8000000000000005e-06,
+ "loss": 0.0277,
+ "step": 76
+ },
+ {
+ "epoch": 1.055299539170507,
+ "grad_norm": 0.38190239667892456,
+ "learning_rate": 3.85e-06,
+ "loss": 0.0338,
+ "step": 77
+ },
+ {
+ "epoch": 1.0691244239631337,
+ "grad_norm": 0.2779421806335449,
+ "learning_rate": 3.900000000000001e-06,
+ "loss": 0.03,
+ "step": 78
+ },
+ {
+ "epoch": 1.0829493087557605,
+ "grad_norm": 0.4055996537208557,
+ "learning_rate": 3.95e-06,
+ "loss": 0.0295,
+ "step": 79
+ },
+ {
+ "epoch": 1.096774193548387,
+ "grad_norm": 0.2987312972545624,
+ "learning_rate": 4.000000000000001e-06,
+ "loss": 0.028,
+ "step": 80
+ },
+ {
+ "epoch": 1.1105990783410138,
+ "grad_norm": 0.2674776017665863,
+ "learning_rate": 4.05e-06,
+ "loss": 0.0243,
+ "step": 81
+ },
+ {
+ "epoch": 1.1244239631336406,
+ "grad_norm": 0.29042816162109375,
+ "learning_rate": 4.1e-06,
+ "loss": 0.0318,
+ "step": 82
+ },
+ {
+ "epoch": 1.1382488479262673,
+ "grad_norm": 0.2904883027076721,
+ "learning_rate": 4.15e-06,
+ "loss": 0.0257,
+ "step": 83
+ },
+ {
+ "epoch": 1.1520737327188941,
+ "grad_norm": 0.30603015422821045,
+ "learning_rate": 4.2000000000000004e-06,
+ "loss": 0.0284,
+ "step": 84
+ },
+ {
+ "epoch": 1.1658986175115207,
+ "grad_norm": 0.23131045699119568,
+ "learning_rate": 4.25e-06,
+ "loss": 0.0285,
+ "step": 85
+ },
+ {
+ "epoch": 1.1797235023041475,
+ "grad_norm": 0.26788002252578735,
+ "learning_rate": 4.3e-06,
+ "loss": 0.0269,
+ "step": 86
+ },
+ {
+ "epoch": 1.1935483870967742,
+ "grad_norm": 0.2639651894569397,
+ "learning_rate": 4.350000000000001e-06,
+ "loss": 0.0289,
+ "step": 87
+ },
+ {
+ "epoch": 1.2073732718894008,
+ "grad_norm": 0.25068584084510803,
+ "learning_rate": 4.4e-06,
+ "loss": 0.0275,
+ "step": 88
+ },
+ {
+ "epoch": 1.2211981566820276,
+ "grad_norm": 0.25494542717933655,
+ "learning_rate": 4.450000000000001e-06,
+ "loss": 0.0275,
+ "step": 89
+ },
+ {
+ "epoch": 1.2350230414746544,
+ "grad_norm": 0.31125035881996155,
+ "learning_rate": 4.5e-06,
+ "loss": 0.0251,
+ "step": 90
+ },
+ {
+ "epoch": 1.2488479262672811,
+ "grad_norm": 0.2691773474216461,
+ "learning_rate": 4.5500000000000005e-06,
+ "loss": 0.0267,
+ "step": 91
+ },
+ {
+ "epoch": 1.262672811059908,
+ "grad_norm": 0.20079147815704346,
+ "learning_rate": 4.600000000000001e-06,
+ "loss": 0.0263,
+ "step": 92
+ },
+ {
+ "epoch": 1.2764976958525347,
+ "grad_norm": 0.28027331829071045,
+ "learning_rate": 4.65e-06,
+ "loss": 0.0227,
+ "step": 93
+ },
+ {
+ "epoch": 1.2903225806451613,
+ "grad_norm": 0.40053099393844604,
+ "learning_rate": 4.7e-06,
+ "loss": 0.0246,
+ "step": 94
+ },
+ {
+ "epoch": 1.304147465437788,
+ "grad_norm": 0.33066362142562866,
+ "learning_rate": 4.75e-06,
+ "loss": 0.0221,
+ "step": 95
+ },
+ {
+ "epoch": 1.3179723502304148,
+ "grad_norm": 0.2531339228153229,
+ "learning_rate": 4.800000000000001e-06,
+ "loss": 0.0216,
+ "step": 96
+ },
+ {
+ "epoch": 1.3317972350230414,
+ "grad_norm": 0.37544378638267517,
+ "learning_rate": 4.85e-06,
+ "loss": 0.0247,
+ "step": 97
+ },
+ {
+ "epoch": 1.3456221198156681,
+ "grad_norm": 0.34273672103881836,
+ "learning_rate": 4.9000000000000005e-06,
+ "loss": 0.0217,
+ "step": 98
+ },
+ {
+ "epoch": 1.359447004608295,
+ "grad_norm": 0.2338661253452301,
+ "learning_rate": 4.95e-06,
+ "loss": 0.0237,
+ "step": 99
+ },
+ {
+ "epoch": 1.3732718894009217,
+ "grad_norm": 0.30151981115341187,
+ "learning_rate": 5e-06,
+ "loss": 0.0248,
+ "step": 100
+ },
+ {
+ "epoch": 1.3870967741935485,
+ "grad_norm": 0.3205336630344391,
+ "learning_rate": 4.999888074163108e-06,
+ "loss": 0.0232,
+ "step": 101
+ },
+ {
+ "epoch": 1.400921658986175,
+ "grad_norm": 0.2705315351486206,
+ "learning_rate": 4.999552306674345e-06,
+ "loss": 0.0245,
+ "step": 102
+ },
+ {
+ "epoch": 1.4147465437788018,
+ "grad_norm": 0.2564137578010559,
+ "learning_rate": 4.998992727598557e-06,
+ "loss": 0.0274,
+ "step": 103
+ },
+ {
+ "epoch": 1.4285714285714286,
+ "grad_norm": 0.1967611312866211,
+ "learning_rate": 4.998209387040829e-06,
+ "loss": 0.0173,
+ "step": 104
+ },
+ {
+ "epoch": 1.4423963133640554,
+ "grad_norm": 0.2568240761756897,
+ "learning_rate": 4.9972023551419995e-06,
+ "loss": 0.0223,
+ "step": 105
+ },
+ {
+ "epoch": 1.456221198156682,
+ "grad_norm": 0.2236352413892746,
+ "learning_rate": 4.995971722072379e-06,
+ "loss": 0.0202,
+ "step": 106
+ },
+ {
+ "epoch": 1.4700460829493087,
+ "grad_norm": 0.3389627933502197,
+ "learning_rate": 4.9945175980236745e-06,
+ "loss": 0.0214,
+ "step": 107
+ },
+ {
+ "epoch": 1.4838709677419355,
+ "grad_norm": 0.31428012251853943,
+ "learning_rate": 4.992840113199131e-06,
+ "loss": 0.0188,
+ "step": 108
+ },
+ {
+ "epoch": 1.4976958525345623,
+ "grad_norm": 0.41508516669273376,
+ "learning_rate": 4.990939417801859e-06,
+ "loss": 0.0213,
+ "step": 109
+ },
+ {
+ "epoch": 1.511520737327189,
+ "grad_norm": 0.19615545868873596,
+ "learning_rate": 4.988815682021398e-06,
+ "loss": 0.0191,
+ "step": 110
+ },
+ {
+ "epoch": 1.5253456221198156,
+ "grad_norm": 0.2059931755065918,
+ "learning_rate": 4.986469096018472e-06,
+ "loss": 0.0208,
+ "step": 111
+ },
+ {
+ "epoch": 1.5391705069124424,
+ "grad_norm": 0.26946336030960083,
+ "learning_rate": 4.983899869907963e-06,
+ "loss": 0.0192,
+ "step": 112
+ },
+ {
+ "epoch": 1.5529953917050692,
+ "grad_norm": 0.3227538466453552,
+ "learning_rate": 4.981108233740096e-06,
+ "loss": 0.0169,
+ "step": 113
+ },
+ {
+ "epoch": 1.5668202764976957,
+ "grad_norm": 0.2811918258666992,
+ "learning_rate": 4.978094437479843e-06,
+ "loss": 0.0151,
+ "step": 114
+ },
+ {
+ "epoch": 1.5806451612903225,
+ "grad_norm": 0.32980477809906006,
+ "learning_rate": 4.97485875098454e-06,
+ "loss": 0.0182,
+ "step": 115
+ },
+ {
+ "epoch": 1.5944700460829493,
+ "grad_norm": 0.2759259045124054,
+ "learning_rate": 4.971401463979722e-06,
+ "loss": 0.0192,
+ "step": 116
+ },
+ {
+ "epoch": 1.608294930875576,
+ "grad_norm": 0.2572178840637207,
+ "learning_rate": 4.967722886033181e-06,
+ "loss": 0.0198,
+ "step": 117
+ },
+ {
+ "epoch": 1.6221198156682028,
+ "grad_norm": 0.3238658905029297,
+ "learning_rate": 4.963823346527249e-06,
+ "loss": 0.0186,
+ "step": 118
+ },
+ {
+ "epoch": 1.6359447004608296,
+ "grad_norm": 0.3834918737411499,
+ "learning_rate": 4.959703194629304e-06,
+ "loss": 0.0188,
+ "step": 119
+ },
+ {
+ "epoch": 1.6497695852534562,
+ "grad_norm": 0.23881244659423828,
+ "learning_rate": 4.955362799260507e-06,
+ "loss": 0.0182,
+ "step": 120
+ },
+ {
+ "epoch": 1.663594470046083,
+ "grad_norm": 0.1885918825864792,
+ "learning_rate": 4.950802549062764e-06,
+ "loss": 0.0183,
+ "step": 121
+ },
+ {
+ "epoch": 1.6774193548387095,
+ "grad_norm": 0.34959614276885986,
+ "learning_rate": 4.946022852363932e-06,
+ "loss": 0.0173,
+ "step": 122
+ },
+ {
+ "epoch": 1.6912442396313363,
+ "grad_norm": 0.22990310192108154,
+ "learning_rate": 4.9410241371412525e-06,
+ "loss": 0.0135,
+ "step": 123
+ },
+ {
+ "epoch": 1.705069124423963,
+ "grad_norm": 0.2790350615978241,
+ "learning_rate": 4.935806850983034e-06,
+ "loss": 0.0159,
+ "step": 124
+ },
+ {
+ "epoch": 1.7188940092165899,
+ "grad_norm": 0.3218020796775818,
+ "learning_rate": 4.9303714610485705e-06,
+ "loss": 0.0176,
+ "step": 125
+ },
+ {
+ "epoch": 1.7327188940092166,
+ "grad_norm": 0.2294609695672989,
+ "learning_rate": 4.924718454026318e-06,
+ "loss": 0.0149,
+ "step": 126
+ },
+ {
+ "epoch": 1.7465437788018434,
+ "grad_norm": 0.3427927494049072,
+ "learning_rate": 4.918848336090309e-06,
+ "loss": 0.0165,
+ "step": 127
+ },
+ {
+ "epoch": 1.7603686635944702,
+ "grad_norm": 0.22731825709342957,
+ "learning_rate": 4.912761632854834e-06,
+ "loss": 0.0145,
+ "step": 128
+ },
+ {
+ "epoch": 1.7741935483870968,
+ "grad_norm": 0.35364386439323425,
+ "learning_rate": 4.906458889327375e-06,
+ "loss": 0.0161,
+ "step": 129
+ },
+ {
+ "epoch": 1.7880184331797235,
+ "grad_norm": 0.29476454854011536,
+ "learning_rate": 4.899940669859807e-06,
+ "loss": 0.0154,
+ "step": 130
+ },
+ {
+ "epoch": 1.80184331797235,
+ "grad_norm": 0.28667864203453064,
+ "learning_rate": 4.893207558097867e-06,
+ "loss": 0.0143,
+ "step": 131
+ },
+ {
+ "epoch": 1.8156682027649769,
+ "grad_norm": 0.2731999158859253,
+ "learning_rate": 4.8862601569288885e-06,
+ "loss": 0.0141,
+ "step": 132
+ },
+ {
+ "epoch": 1.8294930875576036,
+ "grad_norm": 0.2670470178127289,
+ "learning_rate": 4.879099088427824e-06,
+ "loss": 0.0131,
+ "step": 133
+ },
+ {
+ "epoch": 1.8433179723502304,
+ "grad_norm": 0.23313525319099426,
+ "learning_rate": 4.871724993801541e-06,
+ "loss": 0.012,
+ "step": 134
+ },
+ {
+ "epoch": 1.8571428571428572,
+ "grad_norm": 0.2192607820034027,
+ "learning_rate": 4.864138533331411e-06,
+ "loss": 0.0125,
+ "step": 135
+ },
+ {
+ "epoch": 1.870967741935484,
+ "grad_norm": 0.26603585481643677,
+ "learning_rate": 4.8563403863141825e-06,
+ "loss": 0.0121,
+ "step": 136
+ },
+ {
+ "epoch": 1.8847926267281108,
+ "grad_norm": 0.32500001788139343,
+ "learning_rate": 4.84833125100116e-06,
+ "loss": 0.0116,
+ "step": 137
+ },
+ {
+ "epoch": 1.8986175115207373,
+ "grad_norm": 0.24893291294574738,
+ "learning_rate": 4.840111844535682e-06,
+ "loss": 0.0119,
+ "step": 138
+ },
+ {
+ "epoch": 1.912442396313364,
+ "grad_norm": 0.17670764029026031,
+ "learning_rate": 4.8316829028889076e-06,
+ "loss": 0.0096,
+ "step": 139
+ },
+ {
+ "epoch": 1.9262672811059907,
+ "grad_norm": 0.16747575998306274,
+ "learning_rate": 4.823045180793914e-06,
+ "loss": 0.0113,
+ "step": 140
+ },
+ {
+ "epoch": 1.9400921658986174,
+ "grad_norm": 0.19587458670139313,
+ "learning_rate": 4.8141994516781196e-06,
+ "loss": 0.0111,
+ "step": 141
+ },
+ {
+ "epoch": 1.9539170506912442,
+ "grad_norm": 0.237543985247612,
+ "learning_rate": 4.805146507594034e-06,
+ "loss": 0.0088,
+ "step": 142
+ },
+ {
+ "epoch": 1.967741935483871,
+ "grad_norm": 0.22710399329662323,
+ "learning_rate": 4.7958871591483305e-06,
+ "loss": 0.0085,
+ "step": 143
+ },
+ {
+ "epoch": 1.9815668202764978,
+ "grad_norm": 0.2946629822254181,
+ "learning_rate": 4.786422235429269e-06,
+ "loss": 0.0122,
+ "step": 144
+ },
+ {
+ "epoch": 1.9953917050691246,
+ "grad_norm": 0.2763853371143341,
+ "learning_rate": 4.776752583932455e-06,
+ "loss": 0.0118,
+ "step": 145
+ },
+ {
+ "epoch": 2.0,
+ "grad_norm": 0.2763853371143341,
+ "learning_rate": 4.766879070484957e-06,
+ "loss": 0.0078,
+ "step": 146
+ },
+ {
+ "epoch": 2.013824884792627,
+ "grad_norm": 0.2722196877002716,
+ "learning_rate": 4.756802579167781e-06,
+ "loss": 0.0076,
+ "step": 147
+ },
+ {
+ "epoch": 2.0276497695852536,
+ "grad_norm": 0.18556565046310425,
+ "learning_rate": 4.746524012236706e-06,
+ "loss": 0.0091,
+ "step": 148
+ },
+ {
+ "epoch": 2.0414746543778803,
+ "grad_norm": 0.24442361295223236,
+ "learning_rate": 4.736044290041496e-06,
+ "loss": 0.009,
+ "step": 149
+ },
+ {
+ "epoch": 2.055299539170507,
+ "grad_norm": 0.24207571148872375,
+ "learning_rate": 4.725364350943492e-06,
+ "loss": 0.0085,
+ "step": 150
+ },
+ {
+ "epoch": 2.0691244239631335,
+ "grad_norm": 0.18502290546894073,
+ "learning_rate": 4.714485151231593e-06,
+ "loss": 0.0059,
+ "step": 151
+ },
+ {
+ "epoch": 2.0829493087557602,
+ "grad_norm": 0.3010450303554535,
+ "learning_rate": 4.703407665036622e-06,
+ "loss": 0.0071,
+ "step": 152
+ },
+ {
+ "epoch": 2.096774193548387,
+ "grad_norm": 0.23272967338562012,
+ "learning_rate": 4.692132884244113e-06,
+ "loss": 0.0074,
+ "step": 153
+ },
+ {
+ "epoch": 2.110599078341014,
+ "grad_norm": 0.25476181507110596,
+ "learning_rate": 4.680661818405485e-06,
+ "loss": 0.0082,
+ "step": 154
+ },
+ {
+ "epoch": 2.1244239631336406,
+ "grad_norm": 0.24534538388252258,
+ "learning_rate": 4.668995494647653e-06,
+ "loss": 0.0065,
+ "step": 155
+ },
+ {
+ "epoch": 2.1382488479262673,
+ "grad_norm": 0.1642732173204422,
+ "learning_rate": 4.657134957581057e-06,
+ "loss": 0.0054,
+ "step": 156
+ },
+ {
+ "epoch": 2.152073732718894,
+ "grad_norm": 0.21100501716136932,
+ "learning_rate": 4.645081269206128e-06,
+ "loss": 0.0091,
+ "step": 157
+ },
+ {
+ "epoch": 2.165898617511521,
+ "grad_norm": 0.19043587148189545,
+ "learning_rate": 4.632835508818192e-06,
+ "loss": 0.0047,
+ "step": 158
+ },
+ {
+ "epoch": 2.1797235023041477,
+ "grad_norm": 0.1804375797510147,
+ "learning_rate": 4.620398772910833e-06,
+ "loss": 0.0068,
+ "step": 159
+ },
+ {
+ "epoch": 2.193548387096774,
+ "grad_norm": 0.6586657762527466,
+ "learning_rate": 4.607772175077712e-06,
+ "loss": 0.0049,
+ "step": 160
+ },
+ {
+ "epoch": 2.207373271889401,
+ "grad_norm": 0.18181656301021576,
+ "learning_rate": 4.59495684591285e-06,
+ "loss": 0.0071,
+ "step": 161
+ },
+ {
+ "epoch": 2.2211981566820276,
+ "grad_norm": 0.760053813457489,
+ "learning_rate": 4.581953932909403e-06,
+ "loss": 0.0065,
+ "step": 162
+ },
+ {
+ "epoch": 2.2350230414746544,
+ "grad_norm": 0.1935238242149353,
+ "learning_rate": 4.5687646003569055e-06,
+ "loss": 0.0066,
+ "step": 163
+ },
+ {
+ "epoch": 2.248847926267281,
+ "grad_norm": 0.3035024404525757,
+ "learning_rate": 4.555390029237026e-06,
+ "loss": 0.0046,
+ "step": 164
+ },
+ {
+ "epoch": 2.262672811059908,
+ "grad_norm": 0.16596420109272003,
+ "learning_rate": 4.541831417117815e-06,
+ "loss": 0.007,
+ "step": 165
+ },
+ {
+ "epoch": 2.2764976958525347,
+ "grad_norm": 0.2578873336315155,
+ "learning_rate": 4.528089978046481e-06,
+ "loss": 0.0048,
+ "step": 166
+ },
+ {
+ "epoch": 2.2903225806451615,
+ "grad_norm": 1.7751781940460205,
+ "learning_rate": 4.514166942440679e-06,
+ "loss": 0.0041,
+ "step": 167
+ },
+ {
+ "epoch": 2.3041474654377883,
+ "grad_norm": 0.37872445583343506,
+ "learning_rate": 4.5000635569783365e-06,
+ "loss": 0.0045,
+ "step": 168
+ },
+ {
+ "epoch": 2.3179723502304146,
+ "grad_norm": 0.22949594259262085,
+ "learning_rate": 4.4857810844860325e-06,
+ "loss": 0.0071,
+ "step": 169
+ },
+ {
+ "epoch": 2.3317972350230414,
+ "grad_norm": 0.34662699699401855,
+ "learning_rate": 4.471320803825915e-06,
+ "loss": 0.006,
+ "step": 170
+ },
+ {
+ "epoch": 2.345622119815668,
+ "grad_norm": 0.5892661213874817,
+ "learning_rate": 4.4566840097811956e-06,
+ "loss": 0.0055,
+ "step": 171
+ },
+ {
+ "epoch": 2.359447004608295,
+ "grad_norm": 0.18866907060146332,
+ "learning_rate": 4.4418720129402145e-06,
+ "loss": 0.0036,
+ "step": 172
+ },
+ {
+ "epoch": 2.3732718894009217,
+ "grad_norm": 0.1510942429304123,
+ "learning_rate": 4.426886139579083e-06,
+ "loss": 0.0065,
+ "step": 173
+ },
+ {
+ "epoch": 2.3870967741935485,
+ "grad_norm": 0.21291828155517578,
+ "learning_rate": 4.411727731542937e-06,
+ "loss": 0.004,
+ "step": 174
+ },
+ {
+ "epoch": 2.4009216589861753,
+ "grad_norm": 0.18649035692214966,
+ "learning_rate": 4.39639814612578e-06,
+ "loss": 0.0047,
+ "step": 175
+ },
+ {
+ "epoch": 2.4147465437788016,
+ "grad_norm": 0.19008278846740723,
+ "learning_rate": 4.3808987559489536e-06,
+ "loss": 0.0071,
+ "step": 176
+ },
+ {
+ "epoch": 2.4285714285714284,
+ "grad_norm": 0.26282456517219543,
+ "learning_rate": 4.365230948838232e-06,
+ "loss": 0.0044,
+ "step": 177
+ },
+ {
+ "epoch": 2.442396313364055,
+ "grad_norm": 0.2351403385400772,
+ "learning_rate": 4.349396127699552e-06,
+ "loss": 0.0057,
+ "step": 178
+ },
+ {
+ "epoch": 2.456221198156682,
+ "grad_norm": 0.20451441407203674,
+ "learning_rate": 4.3333957103934025e-06,
+ "loss": 0.003,
+ "step": 179
+ },
+ {
+ "epoch": 2.4700460829493087,
+ "grad_norm": 0.22120380401611328,
+ "learning_rate": 4.317231129607859e-06,
+ "loss": 0.0045,
+ "step": 180
+ },
+ {
+ "epoch": 2.4838709677419355,
+ "grad_norm": 0.18543967604637146,
+ "learning_rate": 4.30090383273031e-06,
+ "loss": 0.0062,
+ "step": 181
+ },
+ {
+ "epoch": 2.4976958525345623,
+ "grad_norm": 0.18473730981349945,
+ "learning_rate": 4.2844152817178476e-06,
+ "loss": 0.0052,
+ "step": 182
+ },
+ {
+ "epoch": 2.511520737327189,
+ "grad_norm": 0.21087361872196198,
+ "learning_rate": 4.267766952966369e-06,
+ "loss": 0.0041,
+ "step": 183
+ },
+ {
+ "epoch": 2.525345622119816,
+ "grad_norm": 0.24977360665798187,
+ "learning_rate": 4.2509603371783776e-06,
+ "loss": 0.0053,
+ "step": 184
+ },
+ {
+ "epoch": 2.539170506912442,
+ "grad_norm": 0.19377018511295319,
+ "learning_rate": 4.233996939229502e-06,
+ "loss": 0.0037,
+ "step": 185
+ },
+ {
+ "epoch": 2.5529953917050694,
+ "grad_norm": 0.21130548417568207,
+ "learning_rate": 4.216878278033753e-06,
+ "loss": 0.0031,
+ "step": 186
+ },
+ {
+ "epoch": 2.5668202764976957,
+ "grad_norm": 0.13288047909736633,
+ "learning_rate": 4.199605886407515e-06,
+ "loss": 0.0029,
+ "step": 187
+ },
+ {
+ "epoch": 2.5806451612903225,
+ "grad_norm": 0.15998876094818115,
+ "learning_rate": 4.1821813109322975e-06,
+ "loss": 0.0031,
+ "step": 188
+ },
+ {
+ "epoch": 2.5944700460829493,
+ "grad_norm": 0.19475506246089935,
+ "learning_rate": 4.164606111816256e-06,
+ "loss": 0.0022,
+ "step": 189
+ },
+ {
+ "epoch": 2.608294930875576,
+ "grad_norm": 0.1446300446987152,
+ "learning_rate": 4.146881862754485e-06,
+ "loss": 0.0025,
+ "step": 190
+ },
+ {
+ "epoch": 2.622119815668203,
+ "grad_norm": 0.13051164150238037,
+ "learning_rate": 4.129010150788112e-06,
+ "loss": 0.0019,
+ "step": 191
+ },
+ {
+ "epoch": 2.6359447004608296,
+ "grad_norm": 0.1953984946012497,
+ "learning_rate": 4.110992576162193e-06,
+ "loss": 0.0021,
+ "step": 192
+ },
+ {
+ "epoch": 2.6497695852534564,
+ "grad_norm": 0.23630598187446594,
+ "learning_rate": 4.092830752182423e-06,
+ "loss": 0.002,
+ "step": 193
+ },
+ {
+ "epoch": 2.6635944700460827,
+ "grad_norm": 0.2919062376022339,
+ "learning_rate": 4.074526305070679e-06,
+ "loss": 0.0017,
+ "step": 194
+ },
+ {
+ "epoch": 2.6774193548387095,
+ "grad_norm": 0.22015534341335297,
+ "learning_rate": 4.056080873819412e-06,
+ "loss": 0.0025,
+ "step": 195
+ },
+ {
+ "epoch": 2.6912442396313363,
+ "grad_norm": 0.9449160099029541,
+ "learning_rate": 4.037496110044885e-06,
+ "loss": 0.0024,
+ "step": 196
+ },
+ {
+ "epoch": 2.705069124423963,
+ "grad_norm": 0.25235581398010254,
+ "learning_rate": 4.018773677839289e-06,
+ "loss": 0.0031,
+ "step": 197
+ },
+ {
+ "epoch": 2.71889400921659,
+ "grad_norm": 0.3098089098930359,
+ "learning_rate": 3.999915253621739e-06,
+ "loss": 0.0019,
+ "step": 198
+ },
+ {
+ "epoch": 2.7327188940092166,
+ "grad_norm": 0.19896291196346283,
+ "learning_rate": 3.980922525988167e-06,
+ "loss": 0.0019,
+ "step": 199
+ },
+ {
+ "epoch": 2.7465437788018434,
+ "grad_norm": 0.21136268973350525,
+ "learning_rate": 3.961797195560118e-06,
+ "loss": 0.0031,
+ "step": 200
+ },
+ {
+ "epoch": 2.76036866359447,
+ "grad_norm": 0.2549005150794983,
+ "learning_rate": 3.942540974832486e-06,
+ "loss": 0.0017,
+ "step": 201
+ },
+ {
+ "epoch": 2.774193548387097,
+ "grad_norm": 0.14762410521507263,
+ "learning_rate": 3.9231555880201655e-06,
+ "loss": 0.0022,
+ "step": 202
+ },
+ {
+ "epoch": 2.7880184331797233,
+ "grad_norm": 0.16235944628715515,
+ "learning_rate": 3.903642770903671e-06,
+ "loss": 0.0012,
+ "step": 203
+ },
+ {
+ "epoch": 2.80184331797235,
+ "grad_norm": 0.1506718099117279,
+ "learning_rate": 3.884004270673711e-06,
+ "loss": 0.0015,
+ "step": 204
+ },
+ {
+ "epoch": 2.815668202764977,
+ "grad_norm": 0.10484135895967484,
+ "learning_rate": 3.864241845774746e-06,
+ "loss": 0.0016,
+ "step": 205
+ },
+ {
+ "epoch": 2.8294930875576036,
+ "grad_norm": 0.7636306285858154,
+ "learning_rate": 3.844357265747531e-06,
+ "loss": 0.0018,
+ "step": 206
+ },
+ {
+ "epoch": 2.8433179723502304,
+ "grad_norm": 0.2242082804441452,
+ "learning_rate": 3.8243523110706736e-06,
+ "loss": 0.0021,
+ "step": 207
+ },
+ {
+ "epoch": 2.857142857142857,
+ "grad_norm": 0.3264133334159851,
+ "learning_rate": 3.8042287730012117e-06,
+ "loss": 0.0021,
+ "step": 208
+ },
+ {
+ "epoch": 2.870967741935484,
+ "grad_norm": 0.12472204118967056,
+ "learning_rate": 3.7839884534142157e-06,
+ "loss": 0.0011,
+ "step": 209
+ },
+ {
+ "epoch": 2.8847926267281108,
+ "grad_norm": 0.07526414096355438,
+ "learning_rate": 3.7636331646414524e-06,
+ "loss": 0.0017,
+ "step": 210
+ },
+ {
+ "epoch": 2.8986175115207375,
+ "grad_norm": 0.16134843230247498,
+ "learning_rate": 3.7431647293091076e-06,
+ "loss": 0.0019,
+ "step": 211
+ },
+ {
+ "epoch": 2.912442396313364,
+ "grad_norm": 0.14789307117462158,
+ "learning_rate": 3.7225849801745835e-06,
+ "loss": 0.0012,
+ "step": 212
+ },
+ {
+ "epoch": 2.9262672811059907,
+ "grad_norm": 0.13681238889694214,
+ "learning_rate": 3.701895759962397e-06,
+ "loss": 0.0011,
+ "step": 213
+ },
+ {
+ "epoch": 2.9400921658986174,
+ "grad_norm": 0.10747735947370529,
+ "learning_rate": 3.6810989211991777e-06,
+ "loss": 0.0007,
+ "step": 214
+ },
+ {
+ "epoch": 2.953917050691244,
+ "grad_norm": 0.08121375739574432,
+ "learning_rate": 3.6601963260477923e-06,
+ "loss": 0.0005,
+ "step": 215
+ },
+ {
+ "epoch": 2.967741935483871,
+ "grad_norm": 0.0884300246834755,
+ "learning_rate": 3.6391898461406045e-06,
+ "loss": 0.0014,
+ "step": 216
+ },
+ {
+ "epoch": 2.9815668202764978,
+ "grad_norm": 0.18539245426654816,
+ "learning_rate": 3.6180813624118898e-06,
+ "loss": 0.002,
+ "step": 217
+ },
+ {
+ "epoch": 2.9953917050691246,
+ "grad_norm": 0.1257522702217102,
+ "learning_rate": 3.5968727649294134e-06,
+ "loss": 0.0015,
+ "step": 218
+ },
+ {
+ "epoch": 3.0,
+ "grad_norm": 1.2422555685043335,
+ "learning_rate": 3.575565952725193e-06,
+ "loss": 0.0002,
+ "step": 219
+ },
+ {
+ "epoch": 3.013824884792627,
+ "grad_norm": 0.06009506434202194,
+ "learning_rate": 3.55416283362546e-06,
+ "loss": 0.0003,
+ "step": 220
+ },
+ {
+ "epoch": 3.0276497695852536,
+ "grad_norm": 0.0876953974366188,
+ "learning_rate": 3.5326653240798283e-06,
+ "loss": 0.0005,
+ "step": 221
+ },
+ {
+ "epoch": 3.0414746543778803,
+ "grad_norm": 0.7512914538383484,
+ "learning_rate": 3.5110753489896924e-06,
+ "loss": 0.0007,
+ "step": 222
+ },
+ {
+ "epoch": 3.055299539170507,
+ "grad_norm": 0.08451899141073227,
+ "learning_rate": 3.4893948415358803e-06,
+ "loss": 0.0009,
+ "step": 223
+ },
+ {
+ "epoch": 3.0691244239631335,
+ "grad_norm": 0.15445305407047272,
+ "learning_rate": 3.4676257430055438e-06,
+ "loss": 0.0006,
+ "step": 224
+ },
+ {
+ "epoch": 3.0829493087557602,
+ "grad_norm": 0.07909094542264938,
+ "learning_rate": 3.4457700026183378e-06,
+ "loss": 0.0004,
+ "step": 225
+ },
+ {
+ "epoch": 3.096774193548387,
+ "grad_norm": 0.03637247905135155,
+ "learning_rate": 3.4238295773518924e-06,
+ "loss": 0.0003,
+ "step": 226
+ },
+ {
+ "epoch": 3.110599078341014,
+ "grad_norm": 0.203308567404747,
+ "learning_rate": 3.4018064317665745e-06,
+ "loss": 0.0003,
+ "step": 227
+ },
+ {
+ "epoch": 3.1244239631336406,
+ "grad_norm": 0.03239201754331589,
+ "learning_rate": 3.3797025378295826e-06,
+ "loss": 0.0002,
+ "step": 228
+ },
+ {
+ "epoch": 3.1382488479262673,
+ "grad_norm": 0.07106538861989975,
+ "learning_rate": 3.357519874738382e-06,
+ "loss": 0.0004,
+ "step": 229
+ },
+ {
+ "epoch": 3.152073732718894,
+ "grad_norm": 0.048268985003232956,
+ "learning_rate": 3.3352604287434752e-06,
+ "loss": 0.0003,
+ "step": 230
+ },
+ {
+ "epoch": 3.165898617511521,
+ "grad_norm": 0.0841558575630188,
+ "learning_rate": 3.31292619297056e-06,
+ "loss": 0.0003,
+ "step": 231
+ },
+ {
+ "epoch": 3.1797235023041477,
+ "grad_norm": 0.07029678672552109,
+ "learning_rate": 3.29051916724206e-06,
+ "loss": 0.0003,
+ "step": 232
+ },
+ {
+ "epoch": 3.193548387096774,
+ "grad_norm": 0.11369964480400085,
+ "learning_rate": 3.2680413578980623e-06,
+ "loss": 0.0014,
+ "step": 233
+ },
+ {
+ "epoch": 3.207373271889401,
+ "grad_norm": 0.0367964468896389,
+ "learning_rate": 3.245494777616664e-06,
+ "loss": 0.0001,
+ "step": 234
+ },
+ {
+ "epoch": 3.2211981566820276,
+ "grad_norm": 0.13746097683906555,
+ "learning_rate": 3.2228814452337587e-06,
+ "loss": 0.0003,
+ "step": 235
+ },
+ {
+ "epoch": 3.2350230414746544,
+ "grad_norm": 0.09046189486980438,
+ "learning_rate": 3.2002033855622683e-06,
+ "loss": 0.0004,
+ "step": 236
+ },
+ {
+ "epoch": 3.248847926267281,
+ "grad_norm": 0.04587667062878609,
+ "learning_rate": 3.177462629210838e-06,
+ "loss": 0.0002,
+ "step": 237
+ },
+ {
+ "epoch": 3.262672811059908,
+ "grad_norm": 0.11323168128728867,
+ "learning_rate": 3.154661212402017e-06,
+ "loss": 0.0003,
+ "step": 238
+ },
+ {
+ "epoch": 3.2764976958525347,
+ "grad_norm": 0.04728177189826965,
+ "learning_rate": 3.131801176789934e-06,
+ "loss": 0.0002,
+ "step": 239
+ },
+ {
+ "epoch": 3.2903225806451615,
+ "grad_norm": 0.527999997138977,
+ "learning_rate": 3.1088845692774798e-06,
+ "loss": 0.0008,
+ "step": 240
+ },
+ {
+ "epoch": 3.3041474654377883,
+ "grad_norm": 0.026646027341485023,
+ "learning_rate": 3.0859134418330373e-06,
+ "loss": 0.0001,
+ "step": 241
+ },
+ {
+ "epoch": 3.3179723502304146,
+ "grad_norm": 0.057450197637081146,
+ "learning_rate": 3.0628898513067357e-06,
+ "loss": 0.0004,
+ "step": 242
+ },
+ {
+ "epoch": 3.3317972350230414,
+ "grad_norm": 0.08258494734764099,
+ "learning_rate": 3.0398158592462847e-06,
+ "loss": 0.0005,
+ "step": 243
+ },
+ {
+ "epoch": 3.345622119815668,
+ "grad_norm": 0.01878846250474453,
+ "learning_rate": 3.0166935317123824e-06,
+ "loss": 0.0001,
+ "step": 244
+ },
+ {
+ "epoch": 3.359447004608295,
+ "grad_norm": 0.041918545961380005,
+ "learning_rate": 2.9935249390937184e-06,
+ "loss": 0.0002,
+ "step": 245
+ },
+ {
+ "epoch": 3.3732718894009217,
+ "grad_norm": 0.04018491134047508,
+ "learning_rate": 2.970312155921585e-06,
+ "loss": 0.0002,
+ "step": 246
+ },
+ {
+ "epoch": 3.3870967741935485,
+ "grad_norm": 0.040825020521879196,
+ "learning_rate": 2.9470572606841295e-06,
+ "loss": 0.0002,
+ "step": 247
+ },
+ {
+ "epoch": 3.4009216589861753,
+ "grad_norm": 0.050590481609106064,
+ "learning_rate": 2.9237623356402423e-06,
+ "loss": 0.0002,
+ "step": 248
+ },
+ {
+ "epoch": 3.4147465437788016,
+ "grad_norm": 0.07999978959560394,
+ "learning_rate": 2.900429466633107e-06,
+ "loss": 0.0002,
+ "step": 249
+ },
+ {
+ "epoch": 3.4285714285714284,
+ "grad_norm": 0.02137935161590576,
+ "learning_rate": 2.8770607429034352e-06,
+ "loss": 0.0001,
+ "step": 250
+ },
+ {
+ "epoch": 3.442396313364055,
+ "grad_norm": 0.18967340886592865,
+ "learning_rate": 2.8536582569023964e-06,
+ "loss": 0.0007,
+ "step": 251
+ },
+ {
+ "epoch": 3.456221198156682,
+ "grad_norm": 0.03681226819753647,
+ "learning_rate": 2.8302241041042564e-06,
+ "loss": 0.0001,
+ "step": 252
+ },
+ {
+ "epoch": 3.4700460829493087,
+ "grad_norm": 0.03142761439085007,
+ "learning_rate": 2.8067603828187446e-06,
+ "loss": 0.0001,
+ "step": 253
+ },
+ {
+ "epoch": 3.4838709677419355,
+ "grad_norm": 0.11318890005350113,
+ "learning_rate": 2.7832691940031755e-06,
+ "loss": 0.0005,
+ "step": 254
+ },
+ {
+ "epoch": 3.4976958525345623,
+ "grad_norm": 0.047176819294691086,
+ "learning_rate": 2.759752641074322e-06,
+ "loss": 0.0002,
+ "step": 255
+ },
+ {
+ "epoch": 3.511520737327189,
+ "grad_norm": 0.0642286092042923,
+ "learning_rate": 2.7362128297200784e-06,
+ "loss": 0.0002,
+ "step": 256
+ },
+ {
+ "epoch": 3.525345622119816,
+ "grad_norm": 0.09328105300664902,
+ "learning_rate": 2.712651867710914e-06,
+ "loss": 0.0004,
+ "step": 257
+ },
+ {
+ "epoch": 3.539170506912442,
+ "grad_norm": 0.08150269836187363,
+ "learning_rate": 2.6890718647111424e-06,
+ "loss": 0.0007,
+ "step": 258
+ },
+ {
+ "epoch": 3.5529953917050694,
+ "grad_norm": 0.03366294875741005,
+ "learning_rate": 2.665474932090017e-06,
+ "loss": 0.0001,
+ "step": 259
+ },
+ {
+ "epoch": 3.5668202764976957,
+ "grad_norm": 0.032316725701093674,
+ "learning_rate": 2.6418631827326857e-06,
+ "loss": 0.0001,
+ "step": 260
+ },
+ {
+ "epoch": 3.5806451612903225,
+ "grad_norm": 0.02776617370545864,
+ "learning_rate": 2.6182387308509927e-06,
+ "loss": 0.0001,
+ "step": 261
+ },
+ {
+ "epoch": 3.5944700460829493,
+ "grad_norm": 0.1258484572172165,
+ "learning_rate": 2.5946036917941765e-06,
+ "loss": 0.0003,
+ "step": 262
+ },
+ {
+ "epoch": 3.608294930875576,
+ "grad_norm": 0.04412033408880234,
+ "learning_rate": 2.570960181859458e-06,
+ "loss": 0.0003,
+ "step": 263
+ },
+ {
+ "epoch": 3.622119815668203,
+ "grad_norm": 0.016816483810544014,
+ "learning_rate": 2.547310318102548e-06,
+ "loss": 0.0001,
+ "step": 264
+ },
+ {
+ "epoch": 3.6359447004608296,
+ "grad_norm": 0.028503524139523506,
+ "learning_rate": 2.5236562181480794e-06,
+ "loss": 0.0001,
+ "step": 265
+ },
+ {
+ "epoch": 3.6497695852534564,
+ "grad_norm": 0.03991785645484924,
+ "learning_rate": 2.5e-06,
+ "loss": 0.0002,
+ "step": 266
+ },
+ {
+ "epoch": 3.6635944700460827,
+ "grad_norm": 0.07638856768608093,
+ "learning_rate": 2.4763437818519205e-06,
+ "loss": 0.0002,
+ "step": 267
+ },
+ {
+ "epoch": 3.6774193548387095,
+ "grad_norm": 0.032387226819992065,
+ "learning_rate": 2.4526896818974534e-06,
+ "loss": 0.0002,
+ "step": 268
+ },
+ {
+ "epoch": 3.6912442396313363,
+ "grad_norm": 0.035975128412246704,
+ "learning_rate": 2.429039818140543e-06,
+ "loss": 0.0002,
+ "step": 269
+ },
+ {
+ "epoch": 3.705069124423963,
+ "grad_norm": 0.021173926070332527,
+ "learning_rate": 2.405396308205825e-06,
+ "loss": 0.0001,
+ "step": 270
+ },
+ {
+ "epoch": 3.71889400921659,
+ "grad_norm": 0.005446314811706543,
+ "learning_rate": 2.381761269149009e-06,
+ "loss": 0.0,
+ "step": 271
+ },
+ {
+ "epoch": 3.7327188940092166,
+ "grad_norm": 0.04019308090209961,
+ "learning_rate": 2.358136817267315e-06,
+ "loss": 0.0001,
+ "step": 272
+ },
+ {
+ "epoch": 3.7465437788018434,
+ "grad_norm": 0.0222685057669878,
+ "learning_rate": 2.334525067909983e-06,
+ "loss": 0.0001,
+ "step": 273
+ },
+ {
+ "epoch": 3.76036866359447,
+ "grad_norm": 0.02486710622906685,
+ "learning_rate": 2.3109281352888593e-06,
+ "loss": 0.0001,
+ "step": 274
+ },
+ {
+ "epoch": 3.774193548387097,
+ "grad_norm": 0.01929207146167755,
+ "learning_rate": 2.2873481322890866e-06,
+ "loss": 0.0001,
+ "step": 275
+ },
+ {
+ "epoch": 3.7880184331797233,
+ "grad_norm": 0.010686581023037434,
+ "learning_rate": 2.263787170279922e-06,
+ "loss": 0.0,
+ "step": 276
+ },
+ {
+ "epoch": 3.80184331797235,
+ "grad_norm": 0.04710806906223297,
+ "learning_rate": 2.2402473589256793e-06,
+ "loss": 0.0001,
+ "step": 277
+ },
+ {
+ "epoch": 3.815668202764977,
+ "grad_norm": 0.00774085009470582,
+ "learning_rate": 2.2167308059968258e-06,
+ "loss": 0.0,
+ "step": 278
+ },
+ {
+ "epoch": 3.8294930875576036,
+ "grad_norm": 0.00735470512881875,
+ "learning_rate": 2.193239617181256e-06,
+ "loss": 0.0,
+ "step": 279
+ },
+ {
+ "epoch": 3.8433179723502304,
+ "grad_norm": 0.005572167690843344,
+ "learning_rate": 2.169775895895745e-06,
+ "loss": 0.0,
+ "step": 280
+ },
+ {
+ "epoch": 3.857142857142857,
+ "grad_norm": 0.07026448100805283,
+ "learning_rate": 2.146341743097604e-06,
+ "loss": 0.0004,
+ "step": 281
+ },
+ {
+ "epoch": 3.870967741935484,
+ "grad_norm": 0.03968067839741707,
+ "learning_rate": 2.1229392570965656e-06,
+ "loss": 0.0001,
+ "step": 282
+ },
+ {
+ "epoch": 3.8847926267281108,
+ "grad_norm": 0.002730958629399538,
+ "learning_rate": 2.0995705333668948e-06,
+ "loss": 0.0,
+ "step": 283
+ },
+ {
+ "epoch": 3.8986175115207375,
+ "grad_norm": 0.010703709907829762,
+ "learning_rate": 2.0762376643597586e-06,
+ "loss": 0.0,
+ "step": 284
+ },
+ {
+ "epoch": 3.912442396313364,
+ "grad_norm": 0.03527766093611717,
+ "learning_rate": 2.0529427393158704e-06,
+ "loss": 0.0001,
+ "step": 285
+ },
+ {
+ "epoch": 3.9262672811059907,
+ "grad_norm": 0.03926033526659012,
+ "learning_rate": 2.0296878440784164e-06,
+ "loss": 0.0001,
+ "step": 286
+ },
+ {
+ "epoch": 3.9400921658986174,
+ "grad_norm": 0.007335775997489691,
+ "learning_rate": 2.006475060906283e-06,
+ "loss": 0.0,
+ "step": 287
+ },
+ {
+ "epoch": 3.953917050691244,
+ "grad_norm": 0.005718631204217672,
+ "learning_rate": 1.9833064682876175e-06,
+ "loss": 0.0,
+ "step": 288
+ },
+ {
+ "epoch": 3.967741935483871,
+ "grad_norm": 0.005941327195614576,
+ "learning_rate": 1.9601841407537157e-06,
+ "loss": 0.0,
+ "step": 289
+ },
+ {
+ "epoch": 3.9815668202764978,
+ "grad_norm": 0.039281055331230164,
+ "learning_rate": 1.937110148693265e-06,
+ "loss": 0.0001,
+ "step": 290
+ },
+ {
+ "epoch": 3.9953917050691246,
+ "grad_norm": 0.06976872682571411,
+ "learning_rate": 1.9140865581669627e-06,
+ "loss": 0.0001,
+ "step": 291
+ },
+ {
+ "epoch": 4.0,
+ "grad_norm": 0.06976872682571411,
+ "learning_rate": 1.8911154307225204e-06,
+ "loss": 0.0,
+ "step": 292
+ },
+ {
+ "epoch": 4.013824884792626,
+ "grad_norm": 0.005908307619392872,
+ "learning_rate": 1.8681988232100674e-06,
+ "loss": 0.0,
+ "step": 293
+ },
+ {
+ "epoch": 4.027649769585254,
+ "grad_norm": 0.022486088797450066,
+ "learning_rate": 1.8453387875979834e-06,
+ "loss": 0.0001,
+ "step": 294
+ },
+ {
+ "epoch": 4.04147465437788,
+ "grad_norm": 0.0074249873869121075,
+ "learning_rate": 1.822537370789163e-06,
+ "loss": 0.0,
+ "step": 295
+ },
+ {
+ "epoch": 4.055299539170507,
+ "grad_norm": 0.004768090322613716,
+ "learning_rate": 1.7997966144377328e-06,
+ "loss": 0.0,
+ "step": 296
+ },
+ {
+ "epoch": 4.0691244239631335,
+ "grad_norm": 0.013053408823907375,
+ "learning_rate": 1.7771185547662417e-06,
+ "loss": 0.0,
+ "step": 297
+ },
+ {
+ "epoch": 4.082949308755761,
+ "grad_norm": 0.00568437110632658,
+ "learning_rate": 1.754505222383337e-06,
+ "loss": 0.0,
+ "step": 298
+ },
+ {
+ "epoch": 4.096774193548387,
+ "grad_norm": 0.006704168394207954,
+ "learning_rate": 1.7319586421019383e-06,
+ "loss": 0.0,
+ "step": 299
+ },
+ {
+ "epoch": 4.110599078341014,
+ "grad_norm": 0.0039120810106396675,
+ "learning_rate": 1.7094808327579401e-06,
+ "loss": 0.0,
+ "step": 300
+ },
+ {
+ "epoch": 4.124423963133641,
+ "grad_norm": 0.009206798858940601,
+ "learning_rate": 1.6870738070294412e-06,
+ "loss": 0.0,
+ "step": 301
+ },
+ {
+ "epoch": 4.138248847926267,
+ "grad_norm": 0.005304583813995123,
+ "learning_rate": 1.6647395712565256e-06,
+ "loss": 0.0,
+ "step": 302
+ },
+ {
+ "epoch": 4.152073732718894,
+ "grad_norm": 0.008103611879050732,
+ "learning_rate": 1.6424801252616186e-06,
+ "loss": 0.0001,
+ "step": 303
+ },
+ {
+ "epoch": 4.1658986175115205,
+ "grad_norm": 0.028891608119010925,
+ "learning_rate": 1.6202974621704176e-06,
+ "loss": 0.0,
+ "step": 304
+ },
+ {
+ "epoch": 4.179723502304148,
+ "grad_norm": 0.0035763406194746494,
+ "learning_rate": 1.5981935682334266e-06,
+ "loss": 0.0,
+ "step": 305
+ },
+ {
+ "epoch": 4.193548387096774,
+ "grad_norm": 0.009718772955238819,
+ "learning_rate": 1.5761704226481078e-06,
+ "loss": 0.0,
+ "step": 306
+ },
+ {
+ "epoch": 4.207373271889401,
+ "grad_norm": 0.01045698020607233,
+ "learning_rate": 1.5542299973816626e-06,
+ "loss": 0.0,
+ "step": 307
+ },
+ {
+ "epoch": 4.221198156682028,
+ "grad_norm": 0.004575685132294893,
+ "learning_rate": 1.5323742569944573e-06,
+ "loss": 0.0,
+ "step": 308
+ },
+ {
+ "epoch": 4.235023041474655,
+ "grad_norm": 0.003245371161028743,
+ "learning_rate": 1.5106051584641208e-06,
+ "loss": 0.0,
+ "step": 309
+ },
+ {
+ "epoch": 4.248847926267281,
+ "grad_norm": 0.005619620904326439,
+ "learning_rate": 1.4889246510103078e-06,
+ "loss": 0.0,
+ "step": 310
+ },
+ {
+ "epoch": 4.2626728110599075,
+ "grad_norm": 0.004715710878372192,
+ "learning_rate": 1.4673346759201728e-06,
+ "loss": 0.0,
+ "step": 311
+ },
+ {
+ "epoch": 4.276497695852535,
+ "grad_norm": 0.007476332131773233,
+ "learning_rate": 1.44583716637454e-06,
+ "loss": 0.0,
+ "step": 312
+ },
+ {
+ "epoch": 4.290322580645161,
+ "grad_norm": 0.01739400625228882,
+ "learning_rate": 1.4244340472748076e-06,
+ "loss": 0.0001,
+ "step": 313
+ },
+ {
+ "epoch": 4.304147465437788,
+ "grad_norm": 0.00816753227263689,
+ "learning_rate": 1.403127235070587e-06,
+ "loss": 0.0,
+ "step": 314
+ },
+ {
+ "epoch": 4.317972350230415,
+ "grad_norm": 0.010216044262051582,
+ "learning_rate": 1.381918637588112e-06,
+ "loss": 0.0,
+ "step": 315
+ },
+ {
+ "epoch": 4.331797235023042,
+ "grad_norm": 0.004990486893802881,
+ "learning_rate": 1.3608101538593965e-06,
+ "loss": 0.0,
+ "step": 316
+ },
+ {
+ "epoch": 4.345622119815668,
+ "grad_norm": 0.004758649505674839,
+ "learning_rate": 1.3398036739522088e-06,
+ "loss": 0.0001,
+ "step": 317
+ },
+ {
+ "epoch": 4.359447004608295,
+ "grad_norm": 0.041808340698480606,
+ "learning_rate": 1.3189010788008234e-06,
+ "loss": 0.0,
+ "step": 318
+ },
+ {
+ "epoch": 4.373271889400922,
+ "grad_norm": 0.012711254879832268,
+ "learning_rate": 1.2981042400376032e-06,
+ "loss": 0.0,
+ "step": 319
+ },
+ {
+ "epoch": 4.387096774193548,
+ "grad_norm": 0.0035697701387107372,
+ "learning_rate": 1.277415019825417e-06,
+ "loss": 0.0,
+ "step": 320
+ },
+ {
+ "epoch": 4.400921658986175,
+ "grad_norm": 0.005487007088959217,
+ "learning_rate": 1.2568352706908937e-06,
+ "loss": 0.0,
+ "step": 321
+ },
+ {
+ "epoch": 4.414746543778802,
+ "grad_norm": 0.01304635126143694,
+ "learning_rate": 1.2363668353585486e-06,
+ "loss": 0.0,
+ "step": 322
+ },
+ {
+ "epoch": 4.428571428571429,
+ "grad_norm": 0.0019787494093179703,
+ "learning_rate": 1.216011546585785e-06,
+ "loss": 0.0,
+ "step": 323
+ },
+ {
+ "epoch": 4.442396313364055,
+ "grad_norm": 0.00808583851903677,
+ "learning_rate": 1.195771226998789e-06,
+ "loss": 0.0,
+ "step": 324
+ },
+ {
+ "epoch": 4.456221198156682,
+ "grad_norm": 0.0022094689775258303,
+ "learning_rate": 1.1756476889293269e-06,
+ "loss": 0.0,
+ "step": 325
+ },
+ {
+ "epoch": 4.470046082949309,
+ "grad_norm": 0.012792594730854034,
+ "learning_rate": 1.1556427342524698e-06,
+ "loss": 0.0,
+ "step": 326
+ },
+ {
+ "epoch": 4.483870967741936,
+ "grad_norm": 0.006805351935327053,
+ "learning_rate": 1.1357581542252555e-06,
+ "loss": 0.0,
+ "step": 327
+ },
+ {
+ "epoch": 4.497695852534562,
+ "grad_norm": 0.003740285988897085,
+ "learning_rate": 1.1159957293262888e-06,
+ "loss": 0.0,
+ "step": 328
+ },
+ {
+ "epoch": 4.511520737327189,
+ "grad_norm": 0.009705561213195324,
+ "learning_rate": 1.0963572290963298e-06,
+ "loss": 0.0,
+ "step": 329
+ },
+ {
+ "epoch": 4.525345622119816,
+ "grad_norm": 0.040002401918172836,
+ "learning_rate": 1.0768444119798357e-06,
+ "loss": 0.0002,
+ "step": 330
+ },
+ {
+ "epoch": 4.539170506912442,
+ "grad_norm": 0.0036789420992136,
+ "learning_rate": 1.0574590251675145e-06,
+ "loss": 0.0,
+ "step": 331
+ },
+ {
+ "epoch": 4.552995391705069,
+ "grad_norm": 0.004043960478156805,
+ "learning_rate": 1.0382028044398823e-06,
+ "loss": 0.0002,
+ "step": 332
+ },
+ {
+ "epoch": 4.566820276497696,
+ "grad_norm": 0.0512581467628479,
+ "learning_rate": 1.0190774740118343e-06,
+ "loss": 0.0,
+ "step": 333
+ },
+ {
+ "epoch": 4.580645161290323,
+ "grad_norm": 0.004926969762891531,
+ "learning_rate": 1.0000847463782615e-06,
+ "loss": 0.0,
+ "step": 334
+ },
+ {
+ "epoch": 4.594470046082949,
+ "grad_norm": 0.0043294900096952915,
+ "learning_rate": 9.812263221607114e-07,
+ "loss": 0.0,
+ "step": 335
+ },
+ {
+ "epoch": 4.6082949308755765,
+ "grad_norm": 0.0023195091634988785,
+ "learning_rate": 9.625038899551162e-07,
+ "loss": 0.0,
+ "step": 336
+ },
+ {
+ "epoch": 4.622119815668203,
+ "grad_norm": 0.0015059575671330094,
+ "learning_rate": 9.439191261805894e-07,
+ "loss": 0.0,
+ "step": 337
+ },
+ {
+ "epoch": 4.635944700460829,
+ "grad_norm": 0.001368862227536738,
+ "learning_rate": 9.254736949293216e-07,
+ "loss": 0.0,
+ "step": 338
+ },
+ {
+ "epoch": 4.649769585253456,
+ "grad_norm": 0.008128674700856209,
+ "learning_rate": 9.07169247817579e-07,
+ "loss": 0.0,
+ "step": 339
+ },
+ {
+ "epoch": 4.663594470046083,
+ "grad_norm": 0.0029226879123598337,
+ "learning_rate": 8.890074238378074e-07,
+ "loss": 0.0,
+ "step": 340
+ },
+ {
+ "epoch": 4.67741935483871,
+ "grad_norm": 0.0012331035686656833,
+ "learning_rate": 8.709898492118885e-07,
+ "loss": 0.0,
+ "step": 341
+ },
+ {
+ "epoch": 4.691244239631336,
+ "grad_norm": 0.005286338273435831,
+ "learning_rate": 8.531181372455161e-07,
+ "loss": 0.0,
+ "step": 342
+ },
+ {
+ "epoch": 4.705069124423963,
+ "grad_norm": 0.0026836844626814127,
+ "learning_rate": 8.353938881837445e-07,
+ "loss": 0.0,
+ "step": 343
+ },
+ {
+ "epoch": 4.71889400921659,
+ "grad_norm": 0.013100259937345982,
+ "learning_rate": 8.178186890677029e-07,
+ "loss": 0.0,
+ "step": 344
+ },
+ {
+ "epoch": 4.732718894009217,
+ "grad_norm": 0.005650435108691454,
+ "learning_rate": 8.003941135924859e-07,
+ "loss": 0.0,
+ "step": 345
+ },
+ {
+ "epoch": 4.746543778801843,
+ "grad_norm": 0.007480297237634659,
+ "learning_rate": 7.83121721966248e-07,
+ "loss": 0.0,
+ "step": 346
+ },
+ {
+ "epoch": 4.76036866359447,
+ "grad_norm": 0.014115474186837673,
+ "learning_rate": 7.66003060770498e-07,
+ "loss": 0.0,
+ "step": 347
+ },
+ {
+ "epoch": 4.774193548387097,
+ "grad_norm": 0.0011564996093511581,
+ "learning_rate": 7.490396628216237e-07,
+ "loss": 0.0,
+ "step": 348
+ },
+ {
+ "epoch": 4.788018433179723,
+ "grad_norm": 0.01101834885776043,
+ "learning_rate": 7.322330470336314e-07,
+ "loss": 0.0,
+ "step": 349
+ },
+ {
+ "epoch": 4.8018433179723505,
+ "grad_norm": 0.003535608062520623,
+ "learning_rate": 7.155847182821524e-07,
+ "loss": 0.0,
+ "step": 350
+ },
+ {
+ "epoch": 4.815668202764977,
+ "grad_norm": 0.03185940906405449,
+ "learning_rate": 6.990961672696908e-07,
+ "loss": 0.0001,
+ "step": 351
+ },
+ {
+ "epoch": 4.829493087557603,
+ "grad_norm": 0.006516721565276384,
+ "learning_rate": 6.827688703921407e-07,
+ "loss": 0.0,
+ "step": 352
+ },
+ {
+ "epoch": 4.84331797235023,
+ "grad_norm": 0.008277276530861855,
+ "learning_rate": 6.666042896065983e-07,
+ "loss": 0.0,
+ "step": 353
+ },
+ {
+ "epoch": 4.857142857142857,
+ "grad_norm": 0.00266360049135983,
+ "learning_rate": 6.506038723004484e-07,
+ "loss": 0.0,
+ "step": 354
+ },
+ {
+ "epoch": 4.870967741935484,
+ "grad_norm": 0.01671386882662773,
+ "learning_rate": 6.347690511617693e-07,
+ "loss": 0.0,
+ "step": 355
+ },
+ {
+ "epoch": 4.88479262672811,
+ "grad_norm": 0.013981528580188751,
+ "learning_rate": 6.191012440510469e-07,
+ "loss": 0.0001,
+ "step": 356
+ },
+ {
+ "epoch": 4.8986175115207375,
+ "grad_norm": 0.02350999414920807,
+ "learning_rate": 6.036018538742208e-07,
+ "loss": 0.0,
+ "step": 357
+ },
+ {
+ "epoch": 4.912442396313364,
+ "grad_norm": 0.004093356896191835,
+ "learning_rate": 5.882722684570638e-07,
+ "loss": 0.0,
+ "step": 358
+ },
+ {
+ "epoch": 4.926267281105991,
+ "grad_norm": 0.00656296918168664,
+ "learning_rate": 5.731138604209169e-07,
+ "loss": 0.0,
+ "step": 359
+ },
+ {
+ "epoch": 4.940092165898617,
+ "grad_norm": 0.002148544415831566,
+ "learning_rate": 5.581279870597866e-07,
+ "loss": 0.0,
+ "step": 360
+ }
+ ],
+ "logging_steps": 1,
+ "max_steps": 432,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 6,
+ "save_steps": 72,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 2.298790820660537e+19,
+ "train_batch_size": 4,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/checkpoint-360/training_args.bin b/checkpoint-360/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..48c70fd554062e31c1333fa196fcbd6a4f178c6c
--- /dev/null
+++ b/checkpoint-360/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d1d334ff15891d240486e54641859af8de96cab43a64ef7bf9dc417387365ae5
+size 7928
diff --git a/checkpoint-360/zero_to_fp32.py b/checkpoint-360/zero_to_fp32.py
new file mode 100644
index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8
--- /dev/null
+++ b/checkpoint-360/zero_to_fp32.py
@@ -0,0 +1,604 @@
+#!/usr/bin/env python
+
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example: python zero_to_fp32.py . pytorch_model.bin
+
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+from collections import OrderedDict
+from dataclasses import dataclass
+
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+ FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+ FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+
+
@dataclass
class zero_model_state:
    """Per-rank model state distilled from a DeepSpeed ``*_model_states.pt`` shard.

    Fix vs. original: the annotations were ``dict()`` *calls* (empty dict
    instances), not types; ``param_shapes`` is actually a list of per-group
    dicts (it is iterated as such in ``parse_model_states``).
    """
    buffers: dict                  # buffer name -> fp32 tensor
    param_shapes: list             # one {param name -> shape} dict per param group
    shared_params: list            # [alias_name, source_name] pairs
    ds_version: int                # deepspeed version that wrote the checkpoint (may be None)
    frozen_param_shapes: dict      # frozen param name -> shape, or None
    frozen_param_fragments: dict   # frozen param name -> saved tensor fragment, or None
+
+
# Global verbosity switch: set to 1 to print per-parameter shape/numel tracing.
debug = 0

# load to cpu — reconstruction below happens entirely in host RAM, no GPU needed
device = torch.device('cpu')
+
+
def atoi(text):
    """Return ``text`` converted to an int when it is all digits, else unchanged."""
    if text.isdigit():
        return int(text)
    return text
+
+
def natural_keys(text):
    """Sort key for human ("natural") ordering, e.g. ``file2`` before ``file10``.

    alist.sort(key=natural_keys) sorts in human order
    http://nedbatchelder.com/blog/200712/human_sorting.html
    (See Toothy's implementation in the comments)
    """
    # Split into alternating non-digit/digit runs; digit runs compare numerically.
    return [int(chunk) if chunk.isdigit() else chunk for chunk in re.split(r'(\d+)', text)]
+
+
def get_model_state_file(checkpoint_dir, zero_stage):
    """Return the path of the single model-states file for this checkpoint.

    Args:
        checkpoint_dir: directory holding the DeepSpeed checkpoint shards.
        zero_stage: ZeRO optimization stage recorded in the checkpoint (1, 2 or 3).

    Raises:
        FileNotFoundError: if the directory or the expected file is missing.
        ValueError: if ``zero_stage`` is not a recognized stage.
    """
    if not os.path.isdir(checkpoint_dir):
        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")

    # there should be only one file
    if zero_stage <= 2:
        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
    elif zero_stage == 3:
        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
    else:
        # Fix: previously an unknown stage fell through with `file` unbound,
        # raising a confusing UnboundLocalError below instead of a clear error.
        raise ValueError(f"unknown zero stage {zero_stage}")

    if not os.path.exists(file):
        raise FileNotFoundError(f"can't find model states file at '{file}'")

    return file
+
+
def get_checkpoint_files(checkpoint_dir, glob_pattern):
    """Glob for checkpoint shards matching ``glob_pattern`` under ``checkpoint_dir``,
    returned in natural (rank) order.

    Raises FileNotFoundError when nothing matches.
    """
    # XXX: need to test that this simple glob rule works for multi-node setup too
    ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
    if not ckpt_files:
        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
    return ckpt_files
+
+
def get_optim_files(checkpoint_dir):
    """Return the per-rank ``*_optim_states.pt`` shard paths, rank-sorted."""
    optim_glob = "*_optim_states.pt"
    return get_checkpoint_files(checkpoint_dir, optim_glob)
+
+
def get_model_state_files(checkpoint_dir):
    """Return the per-rank ``*_model_states.pt`` shard paths, rank-sorted."""
    model_glob = "*_model_states.pt"
    return get_checkpoint_files(checkpoint_dir, model_glob)
+
+
def parse_model_states(files):
    """Load each ``*_model_states.pt`` shard and distill it into a ``zero_model_state``.

    Args:
        files: rank-ordered list of model-states file paths.

    Returns:
        list of ``zero_model_state``, one per rank, carrying buffers, param
        shapes, frozen-param data, shared-param aliases and the ds version.

    Raises:
        ValueError: if a file lacks the buffer-names key, i.e. is not a
            model state checkpoint.
    """
    zero_model_states = []
    for file in files:
        # NOTE(review): shards are fully materialized on CPU — assumes they fit in host RAM
        state_dict = torch.load(file, map_location=device)

        if BUFFER_NAMES not in state_dict:
            raise ValueError(f"{file} is not a model state checkpoint")
        buffer_names = state_dict[BUFFER_NAMES]
        if debug:
            print("Found buffers:", buffer_names)

        # recover just the buffers while restoring them to fp32 if they were saved in fp16
        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
        param_shapes = state_dict[PARAM_SHAPES]

        # collect parameters that are included in param_shapes
        # NOTE(review): param_names is built up but never used after this loop — confirm it can go
        param_names = []
        for s in param_shapes:
            for name in s.keys():
                param_names.append(name)

        # update with frozen parameters
        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
        if frozen_param_shapes is not None:
            if debug:
                print(f"Found frozen_param_shapes: {frozen_param_shapes}")
            param_names += list(frozen_param_shapes.keys())

        # handle shared params
        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]

        ds_version = state_dict.get(DS_VERSION, None)

        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)

        z_model_state = zero_model_state(buffers=buffers,
                                         param_shapes=param_shapes,
                                         shared_params=shared_params,
                                         ds_version=ds_version,
                                         frozen_param_shapes=frozen_param_shapes,
                                         frozen_param_fragments=frozen_param_fragments)
        zero_model_states.append(z_model_state)

    return zero_model_states
+
+
def parse_optim_states(files, ds_checkpoint_dir):
    """Load the per-rank ``*_optim_states.pt`` shards and extract the fp32 master weights.

    Args:
        files: rank-ordered optimizer shard paths.
        ds_checkpoint_dir: checkpoint folder (used only for error messages).

    Returns:
        (zero_stage, world_size, fp32_flat_groups) — fp32_flat_groups is a
        per-rank list; per param group for stages 1/2, one concatenated flat
        tensor per rank for stage 3.

    Raises:
        ValueError: not a zero checkpoint, shard/rank count mismatch, or an
            unknown zero stage.
    """

    total_files = len(files)
    state_dicts = []
    for f in files:
        state_dict = torch.load(f, map_location=device)
        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
        # and also handle the case where it was already removed by another helper script
        state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
        state_dicts.append(state_dict)

    if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]:
        raise ValueError(f"{files[0]} is not a zero checkpoint")
    zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
    world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]

    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
    # parameters can be different from data parallelism for non-expert parameters. So we can just
    # use the max of the partition_count to get the dp world_size.

    if type(world_size) is list:
        world_size = max(world_size)

    if world_size != total_files:
        raise ValueError(
            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
        )

    # the groups are named differently in each stage
    if zero_stage <= 2:
        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
    elif zero_stage == 3:
        fp32_groups_key = FP32_FLAT_GROUPS
    else:
        raise ValueError(f"unknown zero stage {zero_stage}")

    if zero_stage <= 2:
        # zero1/2: keep the flat fp32 partitions separated per param group
        fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
    elif zero_stage == 3:
        # if there is more than one param group, there will be multiple flattened tensors - one
        # flattened tensor per group - for simplicity merge them into a single tensor
        #
        # XXX: could make the script more memory efficient for when there are multiple groups - it
        # will require matching the sub-lists of param_shapes for each param group flattened tensor

        fp32_flat_groups = [
            torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts))
        ]

    return zero_stage, world_size, fp32_flat_groups
+
+
def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
    """
    Returns fp32 state_dict reconstructed from ds checkpoint

    Args:
        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
        - ``exclude_frozen_parameters``: skip frozen (untrained) parameters when True

    """
    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")

    # The optimizer shards reveal the zero stage, dp world size and fp32 master weights.
    optim_files = get_optim_files(ds_checkpoint_dir)
    zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")

    # The model shards supply buffers, param shapes, frozen and shared params.
    zero_model_states = parse_model_states(get_model_state_files(ds_checkpoint_dir))
    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')

    # Dispatch to the stage-specific merger (parse_optim_states already rejected unknown stages).
    if zero_stage <= 2:
        merge = _get_fp32_state_dict_from_zero2_checkpoint
    elif zero_stage == 3:
        merge = _get_fp32_state_dict_from_zero3_checkpoint
    return merge(world_size, fp32_flat_groups, zero_model_states, exclude_frozen_parameters)
+
+
def _zero2_merge_frozen_params(state_dict, zero_model_states):
    """Copy frozen (untrained) parameters into ``state_dict``.

    Under ZeRO-1/2 the frozen fragments saved on rank 0 already hold the full
    tensors, so this is a straight copy — no departitioning required.
    """
    rank0 = zero_model_states[0]
    if rank0.frozen_param_shapes is None or len(rank0.frozen_param_shapes) == 0:
        return

    frozen_param_shapes = rank0.frozen_param_shapes
    frozen_param_fragments = rank0.frozen_param_fragments

    if debug:
        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')

    wanted_params = len(frozen_param_shapes)
    wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
    avail_numel = sum(p.numel() for p in frozen_param_fragments.values())
    print(f'Frozen params: Have {avail_numel} numels to process.')
    print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')

    total_params = 0
    total_numel = 0
    for name, shape in frozen_param_shapes.items():
        unpartitioned_numel = shape.numel()
        total_params += 1
        total_numel += unpartitioned_numel

        # the saved fragment is already the complete tensor under zero1/2
        state_dict[name] = frozen_param_fragments[name]

        if debug:
            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")

    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _has_callable(obj, fn):
+ attr = getattr(obj, fn, None)
+ return callable(attr)
+
+
def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
    """Departition ZeRO-1/2 fp32 flat groups into full named tensors in ``state_dict``.

    For each param group, the per-rank partitions are concatenated into a
    single flat fp32 vector, which is then sliced per parameter following
    rank 0's ``param_shapes`` ordering.
    """
    param_shapes = zero_model_states[0].param_shapes

    # Reconstruction protocol:
    #
    # XXX: document this

    if debug:
        for i in range(world_size):
            for j in range(len(fp32_flat_groups[0])):
                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")

    # XXX: memory usage doubles here (zero2)
    num_param_groups = len(fp32_flat_groups[0])
    merged_single_partition_of_fp32_groups = []
    for i in range(num_param_groups):
        # gather group i's partition from every rank and stitch it back together
        merged_partitions = [sd[i] for sd in fp32_flat_groups]
        full_single_fp32_vector = torch.cat(merged_partitions, 0)
        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
    avail_numel = sum(
        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])

    if debug:
        wanted_params = sum([len(shapes) for shapes in param_shapes])
        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
        # not asserting if there is a mismatch due to possible padding
        print(f"Have {avail_numel} numels to process.")
        print(f"Need {wanted_numel} numels in {wanted_params} params.")

    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    total_numel = 0
    total_params = 0
    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
        offset = 0
        avail_numel = full_single_fp32_vector.numel()
        for name, shape in shapes.items():

            # shape may be a torch.Size (has .numel) or a plain tuple/list
            unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
            total_numel += unpartitioned_numel
            total_params += 1

            if debug:
                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
            # each param is a contiguous slice of the group's flat vector
            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
            offset += unpartitioned_numel

    # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
    # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
    # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
    # live optimizer object, so we are checking that the numbers are within the right range
    # NOTE(review): this check sits outside the group loop, so with multiple param
    # groups it only validates the last group's offset/avail_numel — confirm intended.
    align_to = 2 * world_size

    def zero2_align(x):
        # round x up to the next multiple of align_to
        return align_to * math.ceil(x / align_to)

    if debug:
        print(f"original offset={offset}, avail_numel={avail_numel}")

    offset = zero2_align(offset)
    avail_numel = zero2_align(avail_numel)

    if debug:
        print(f"aligned offset={offset}, avail_numel={avail_numel}")

    # Sanity check
    if offset != avail_numel:
        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")

    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+
+
def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
                                               exclude_frozen_parameters):
    """Assemble the full fp32 state dict from ZeRO-1/2 shards: buffers, then
    frozen params (optional), then departitioned trainable params, then
    shared-param aliases."""
    state_dict = OrderedDict()

    # buffers — rank 0's copy is used
    buffers = zero_model_states[0].buffers
    state_dict.update(buffers)
    if debug:
        print(f"added {len(buffers)} buffers")

    if not exclude_frozen_parameters:
        _zero2_merge_frozen_params(state_dict, zero_model_states)

    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)

    # recover shared parameters: alias points at the same tensor as its source
    for alias_name, source_name in zero_model_states[0].shared_params:
        if source_name in state_dict:
            state_dict[alias_name] = state_dict[source_name]

    return state_dict
+
+
def zero3_partitioned_param_info(unpartitioned_numel, world_size):
    """Return ``(partitioned_numel, padding_numel)`` for one param under ZeRO-3:
    the per-rank slice size and how many pad elements the last rank carries."""
    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
    leftover = unpartitioned_numel % world_size
    padding_numel = world_size - leftover if leftover else 0
    return partitioned_numel, padding_numel
+
+
def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
    """Reassemble ZeRO-3 frozen (untrained) parameters into ``state_dict``.

    Each frozen param is partitioned across ranks; the per-rank fragments are
    concatenated, trimmed of padding, and reshaped to the original shape.
    """
    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
        return

    if debug:
        for i in range(world_size):
            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')

    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
    wanted_params = len(frozen_param_shapes)
    wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
    # each rank holds 1/world_size of every frozen param (rank 0's total * world_size)
    avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
    print(f'Frozen params: Have {avail_numel} numels to process.')
    print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')

    total_params = 0
    total_numel = 0
    for name, shape in zero_model_states[0].frozen_param_shapes.items():
        total_params += 1
        unpartitioned_numel = shape.numel()
        total_numel += unpartitioned_numel

        # concat rank 0..world_size-1 fragments, drop trailing padding, restore shape
        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
        state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)

        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)

        if debug:
            print(
                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
            )

    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
    """Reassemble ZeRO-3 trainable parameters into full tensors in ``state_dict``.

    Under ZeRO-3 every rank holds an equal-sized slice of each param inside
    one flat vector; per param we narrow the same offset window on every
    rank's flat group, concatenate, trim padding and reshape.
    """
    param_shapes = zero_model_states[0].param_shapes
    avail_numel = fp32_flat_groups[0].numel() * world_size
    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
    # param, re-consolidating each param, while dealing with padding if any

    # merge list of dicts, preserving order
    param_shapes = {k: v for d in param_shapes for k, v in d.items()}

    if debug:
        for i in range(world_size):
            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")

    wanted_params = len(param_shapes)
    wanted_numel = sum(shape.numel() for shape in param_shapes.values())
    # not asserting if there is a mismatch due to possible padding
    avail_numel = fp32_flat_groups[0].numel() * world_size
    print(f"Trainable params: Have {avail_numel} numels to process.")
    print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")

    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    offset = 0
    total_numel = 0
    total_params = 0
    for name, shape in param_shapes.items():

        unpartitioned_numel = shape.numel()
        total_numel += unpartitioned_numel
        total_params += 1

        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)

        if debug:
            print(
                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
            )

        # XXX: memory usage doubles here
        # take the same [offset, offset+partitioned_numel) window from each rank's flat
        # vector, concat across ranks, then trim the padding off the end
        state_dict[name] = torch.cat(
            tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)),
            0).narrow(0, 0, unpartitioned_numel).view(shape)
        offset += partitioned_numel

    # offset advanced by per-rank sizes; scale up to compare against the global total
    offset *= world_size

    # Sanity check
    if offset != avail_numel:
        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")

    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+
+
def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
                                               exclude_frozen_parameters):
    """Assemble the full fp32 state dict from ZeRO-3 shards: buffers, then
    frozen params (optional), then departitioned trainable params, then
    shared-param aliases."""
    state_dict = OrderedDict()

    # buffers — rank 0's copy is used
    buffers = zero_model_states[0].buffers
    state_dict.update(buffers)
    if debug:
        print(f"added {len(buffers)} buffers")

    if not exclude_frozen_parameters:
        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)

    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)

    # recover shared parameters: alias points at the same tensor as its source
    for alias_name, source_name in zero_model_states[0].shared_params:
        if source_name in state_dict:
            state_dict[alias_name] = state_dict[source_name]

    return state_dict
+
+
def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False):
    """
    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
    via a model hub.

    Args:
        - ``checkpoint_dir``: path to the desired checkpoint folder
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
        - ``exclude_frozen_parameters``: exclude frozen parameters

    Returns:
        - pytorch ``state_dict``

    Note: this approach may not work if your application doesn't have sufficient free CPU memory and
    you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
    the checkpoint.

    A typical usage might be ::

        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
        # do the training and checkpoint saving
        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
        model = model.cpu() # move to cpu
        model.load_state_dict(state_dict)
        # submit to model hub or save the model to share with others

    In this example the ``model`` will no longer be usable in the deepspeed context of the same
    application. i.e. you will need to re-initialize the deepspeed engine, since
    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.

    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.

    """
    # resolve the tag from the 'latest' marker file when not given explicitly
    if tag is None:
        latest_path = os.path.join(checkpoint_dir, 'latest')
        if not os.path.isfile(latest_path):
            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
        with open(latest_path, 'r') as fd:
            tag = fd.read().strip()

    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
    if not os.path.isdir(ds_checkpoint_dir):
        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")

    return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+
+
def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False):
    """Consolidate a ZeRO 2/3 checkpoint into a single fp32 ``state_dict`` file.

    The output file can later be loaded with ``torch.load(file)`` +
    ``load_state_dict()`` and used for training without DeepSpeed.

    Args:
        checkpoint_dir: path to the checkpoint folder (the one that contains
            the tag-folder, like ``global_step14``).
        output_file: path of the pytorch fp32 state_dict output file
            (e.g. path/pytorch_model.bin).
        tag: checkpoint tag used as a unique identifier; when ``None`` it is
            read from the ``latest`` file in the checkpoint folder
            (e.g. ``global_step14``).
        exclude_frozen_parameters: when True, frozen parameters are left out.
    """
    consolidated = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
                                                            tag=tag,
                                                            exclude_frozen_parameters=exclude_frozen_parameters)
    print(f"Saving fp32 state dict to {output_file}")
    torch.save(consolidated, output_file)
+
+
def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
    """Move ``model`` to CPU and overwrite it with consolidated fp32 weights.

    Steps:
        1. Put the provided model on CPU.
        2. Consolidate the ZeRO 2/3 checkpoint into a single fp32 ``state_dict``.
        3. Load that ``state_dict`` into the model (non-strict).

    Args:
        model: the model object to update.
        checkpoint_dir: path to the checkpoint folder (the one that contains
            the tag-folder, like ``global_step14``).
        tag: checkpoint tag used as a unique identifier; when ``None`` it is
            read from the ``latest`` file in the checkpoint folder
            (e.g. ``global_step14``).

    Returns:
        The modified ``model``.

    Make sure plenty of CPU memory is available before calling this; if not,
    use the ``zero_to_fp32.py`` utility placed in the checkpoint folder to do
    the conversion offline.

    Typical usage::

        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
        # submit to model hub or save the model to share with others

    Note: once this has run, the ``model`` is no longer usable in the same
    DeepSpeed context — ``model.load_state_dict(state_dict)`` removes the
    DeepSpeed wrappers, so the engine must be re-initialized.
    """
    logger.info(f"Extracting fp32 weights")
    fp32_state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)

    logger.info(f"Overwriting model with fp32 weights")
    model = model.cpu()
    # strict=False: adapter/frozen keys may be absent from the consolidated dict.
    model.load_state_dict(fp32_state_dict, strict=False)

    return model
+
+
if __name__ == "__main__":
    # CLI entry point: consolidate a ZeRO checkpoint into one fp32 file.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "checkpoint_dir",
        type=str,
        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
    parser.add_argument(
        "output_file",
        type=str,
        help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)")
    parser.add_argument(
        "-t",
        "--tag",
        type=str,
        default=None,
        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
    args = parser.parse_args()

    # Propagate the CLI flag to the module-level debug switch.
    debug = args.debug

    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
                                               args.output_file,
                                               tag=args.tag,
                                               exclude_frozen_parameters=args.exclude_frozen_parameters)
diff --git a/checkpoint-432/README.md b/checkpoint-432/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..fdf619c317c2fe82074662582dbd68166b6f9d50
--- /dev/null
+++ b/checkpoint-432/README.md
@@ -0,0 +1,202 @@
+---
+base_model: nvidia/Llama-3.1-Nemotron-70B-Instruct-HF
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.15.0
\ No newline at end of file
diff --git a/checkpoint-432/adapter_config.json b/checkpoint-432/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..3abb5d68d20446d2b99ace226d6233a68590205a
--- /dev/null
+++ b/checkpoint-432/adapter_config.json
@@ -0,0 +1,42 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF",
+ "bias": "none",
+ "corda_config": null,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": null,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 512,
+ "lora_bias": false,
+ "lora_dropout": 0.05,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": [
+ "embed_tokens",
+ "lm_head"
+ ],
+ "peft_type": "LORA",
+ "r": 256,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "o_proj",
+ "up_proj",
+ "down_proj",
+ "gate_proj",
+ "v_proj",
+ "k_proj",
+ "q_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-432/adapter_model.safetensors b/checkpoint-432/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..687651e596b08c7a9d61272bbcf8fa414e20c651
--- /dev/null
+++ b/checkpoint-432/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5d972b408cb0b67bf1dc53652467b5e0918debd7a39c06db3c21ed66e350d48d
+size 10829849744
diff --git a/checkpoint-432/global_step428/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-432/global_step428/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..a20d3fbaefe6ff706b8b36da65b62d8c5bd6b041
--- /dev/null
+++ b/checkpoint-432/global_step428/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:df71b64957e981866498bc0071c5a309365f5fb8ef132b109269ac17cf905231
+size 21659418140
diff --git a/checkpoint-432/global_step428/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-432/global_step428/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..932247be2aefec73e544c1efd620a451163b6e8a
--- /dev/null
+++ b/checkpoint-432/global_step428/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1cc89cbc0ecd1cdcf31ce8fea2a654dca41c4b8dcc2e23f3007b73f82d4896a9
+size 21659457372
diff --git a/checkpoint-432/global_step428/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-432/global_step428/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..a94b384ecd9bf936ce0b454cf052cc0fb4c0e4a4
--- /dev/null
+++ b/checkpoint-432/global_step428/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b9259e987ce5e0604a7f37ee9f63d59573548a4646de62e50169db98cecb4aec
+size 21659417820
diff --git a/checkpoint-432/global_step428/mp_rank_00_model_states.pt b/checkpoint-432/global_step428/mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..28bae9e4091009ad64967d25626425af214c1afc
--- /dev/null
+++ b/checkpoint-432/global_step428/mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bb04f6ede1f20a86580ba67fbf550b5d85ebe2c69aed904b9cde793f8d9083d5
+size 11918643933
diff --git a/checkpoint-432/latest b/checkpoint-432/latest
new file mode 100644
index 0000000000000000000000000000000000000000..db93d1b9337260f662cac94ff5f4520d8048fcbe
--- /dev/null
+++ b/checkpoint-432/latest
@@ -0,0 +1 @@
+global_step428
\ No newline at end of file
diff --git a/checkpoint-432/rng_state_0.pth b/checkpoint-432/rng_state_0.pth
new file mode 100644
index 0000000000000000000000000000000000000000..a89fd50e32bffb1b71cc50c592cac24041f46e53
--- /dev/null
+++ b/checkpoint-432/rng_state_0.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:38a554d8f4ca301ef4339ae5943933d7d828788b5c3ec62e5534832636848338
+size 14768
diff --git a/checkpoint-432/rng_state_1.pth b/checkpoint-432/rng_state_1.pth
new file mode 100644
index 0000000000000000000000000000000000000000..f2b366d0d973075aaf88aed166090ddee7817d31
--- /dev/null
+++ b/checkpoint-432/rng_state_1.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:555bd819bfd5c9e74a09c452bad6a77cb4c1d85a2971afc119df333bdf74b0c5
+size 14768
diff --git a/checkpoint-432/rng_state_2.pth b/checkpoint-432/rng_state_2.pth
new file mode 100644
index 0000000000000000000000000000000000000000..c715c8cc5db6813954ca48de1b841e5834b862aa
--- /dev/null
+++ b/checkpoint-432/rng_state_2.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6489a9e9a11061acc429e2cf8ec9f616727e146cf444de72f8062c7c5a24a28d
+size 14768
diff --git a/checkpoint-432/scheduler.pt b/checkpoint-432/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..de62fc323d19a098639523b3d978824d194046f7
--- /dev/null
+++ b/checkpoint-432/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:111ca90245be96ef90c53bd1dd14d193f8b0438ebf2268676c1e7ceb5e4eb4c1
+size 1064
diff --git a/checkpoint-432/special_tokens_map.json b/checkpoint-432/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..278b7f0f84be865c4687700ee7b3c63d89a51e18
--- /dev/null
+++ b/checkpoint-432/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+ "bos_token": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "<|eot_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/checkpoint-432/tokenizer.json b/checkpoint-432/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2
--- /dev/null
+++ b/checkpoint-432/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
+size 17209920
diff --git a/checkpoint-432/tokenizer_config.json b/checkpoint-432/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..fdde94c29816839ec3c29d6c6461206a49911f3c
--- /dev/null
+++ b/checkpoint-432/tokenizer_config.json
@@ -0,0 +1,2064 @@
+{
+ "added_tokens_decoder": {
+ "128000": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128001": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128002": {
+ "content": "<|reserved_special_token_0|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128003": {
+ "content": "<|reserved_special_token_1|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128004": {
+ "content": "<|finetune_right_pad_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128005": {
+ "content": "<|reserved_special_token_2|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128006": {
+ "content": "<|start_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128007": {
+ "content": "<|end_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128008": {
+ "content": "<|eom_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128009": {
+ "content": "<|eot_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128010": {
+ "content": "<|python_tag|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128011": {
+ "content": "<|reserved_special_token_3|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128012": {
+ "content": "<|reserved_special_token_4|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128013": {
+ "content": "<|reserved_special_token_5|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128014": {
+ "content": "<|reserved_special_token_6|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128015": {
+ "content": "<|reserved_special_token_7|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128016": {
+ "content": "<|reserved_special_token_8|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128017": {
+ "content": "<|reserved_special_token_9|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128018": {
+ "content": "<|reserved_special_token_10|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128019": {
+ "content": "<|reserved_special_token_11|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128020": {
+ "content": "<|reserved_special_token_12|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128021": {
+ "content": "<|reserved_special_token_13|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128022": {
+ "content": "<|reserved_special_token_14|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128023": {
+ "content": "<|reserved_special_token_15|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128024": {
+ "content": "<|reserved_special_token_16|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128025": {
+ "content": "<|reserved_special_token_17|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128026": {
+ "content": "<|reserved_special_token_18|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128027": {
+ "content": "<|reserved_special_token_19|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128028": {
+ "content": "<|reserved_special_token_20|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128029": {
+ "content": "<|reserved_special_token_21|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128030": {
+ "content": "<|reserved_special_token_22|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128031": {
+ "content": "<|reserved_special_token_23|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128032": {
+ "content": "<|reserved_special_token_24|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128033": {
+ "content": "<|reserved_special_token_25|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128034": {
+ "content": "<|reserved_special_token_26|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128035": {
+ "content": "<|reserved_special_token_27|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128036": {
+ "content": "<|reserved_special_token_28|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128037": {
+ "content": "<|reserved_special_token_29|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128038": {
+ "content": "<|reserved_special_token_30|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128039": {
+ "content": "<|reserved_special_token_31|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128040": {
+ "content": "<|reserved_special_token_32|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128041": {
+ "content": "<|reserved_special_token_33|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128042": {
+ "content": "<|reserved_special_token_34|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128043": {
+ "content": "<|reserved_special_token_35|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128044": {
+ "content": "<|reserved_special_token_36|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128045": {
+ "content": "<|reserved_special_token_37|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128046": {
+ "content": "<|reserved_special_token_38|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128047": {
+ "content": "<|reserved_special_token_39|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128048": {
+ "content": "<|reserved_special_token_40|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128049": {
+ "content": "<|reserved_special_token_41|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128050": {
+ "content": "<|reserved_special_token_42|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128051": {
+ "content": "<|reserved_special_token_43|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128052": {
+ "content": "<|reserved_special_token_44|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128053": {
+ "content": "<|reserved_special_token_45|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128054": {
+ "content": "<|reserved_special_token_46|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128055": {
+ "content": "<|reserved_special_token_47|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128056": {
+ "content": "<|reserved_special_token_48|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128057": {
+ "content": "<|reserved_special_token_49|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128058": {
+ "content": "<|reserved_special_token_50|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128059": {
+ "content": "<|reserved_special_token_51|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128060": {
+ "content": "<|reserved_special_token_52|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128061": {
+ "content": "<|reserved_special_token_53|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128062": {
+ "content": "<|reserved_special_token_54|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128063": {
+ "content": "<|reserved_special_token_55|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128064": {
+ "content": "<|reserved_special_token_56|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128065": {
+ "content": "<|reserved_special_token_57|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128066": {
+ "content": "<|reserved_special_token_58|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128067": {
+ "content": "<|reserved_special_token_59|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128068": {
+ "content": "<|reserved_special_token_60|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128069": {
+ "content": "<|reserved_special_token_61|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128070": {
+ "content": "<|reserved_special_token_62|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128071": {
+ "content": "<|reserved_special_token_63|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128072": {
+ "content": "<|reserved_special_token_64|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128073": {
+ "content": "<|reserved_special_token_65|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128074": {
+ "content": "<|reserved_special_token_66|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128075": {
+ "content": "<|reserved_special_token_67|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128076": {
+ "content": "<|reserved_special_token_68|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128077": {
+ "content": "<|reserved_special_token_69|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128078": {
+ "content": "<|reserved_special_token_70|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128079": {
+ "content": "<|reserved_special_token_71|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128080": {
+ "content": "<|reserved_special_token_72|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128081": {
+ "content": "<|reserved_special_token_73|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128082": {
+ "content": "<|reserved_special_token_74|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128083": {
+ "content": "<|reserved_special_token_75|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128084": {
+ "content": "<|reserved_special_token_76|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128085": {
+ "content": "<|reserved_special_token_77|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128086": {
+ "content": "<|reserved_special_token_78|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128087": {
+ "content": "<|reserved_special_token_79|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128088": {
+ "content": "<|reserved_special_token_80|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128089": {
+ "content": "<|reserved_special_token_81|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128090": {
+ "content": "<|reserved_special_token_82|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128091": {
+ "content": "<|reserved_special_token_83|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128092": {
+ "content": "<|reserved_special_token_84|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128093": {
+ "content": "<|reserved_special_token_85|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128094": {
+ "content": "<|reserved_special_token_86|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128095": {
+ "content": "<|reserved_special_token_87|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128096": {
+ "content": "<|reserved_special_token_88|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128097": {
+ "content": "<|reserved_special_token_89|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128098": {
+ "content": "<|reserved_special_token_90|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128099": {
+ "content": "<|reserved_special_token_91|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128100": {
+ "content": "<|reserved_special_token_92|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128101": {
+ "content": "<|reserved_special_token_93|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128102": {
+ "content": "<|reserved_special_token_94|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128103": {
+ "content": "<|reserved_special_token_95|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128104": {
+ "content": "<|reserved_special_token_96|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128105": {
+ "content": "<|reserved_special_token_97|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128106": {
+ "content": "<|reserved_special_token_98|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128107": {
+ "content": "<|reserved_special_token_99|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128108": {
+ "content": "<|reserved_special_token_100|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128109": {
+ "content": "<|reserved_special_token_101|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128110": {
+ "content": "<|reserved_special_token_102|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128111": {
+ "content": "<|reserved_special_token_103|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128112": {
+ "content": "<|reserved_special_token_104|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128113": {
+ "content": "<|reserved_special_token_105|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128114": {
+ "content": "<|reserved_special_token_106|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128115": {
+ "content": "<|reserved_special_token_107|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128116": {
+ "content": "<|reserved_special_token_108|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128117": {
+ "content": "<|reserved_special_token_109|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128118": {
+ "content": "<|reserved_special_token_110|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128119": {
+ "content": "<|reserved_special_token_111|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128120": {
+ "content": "<|reserved_special_token_112|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128121": {
+ "content": "<|reserved_special_token_113|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128122": {
+ "content": "<|reserved_special_token_114|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128123": {
+ "content": "<|reserved_special_token_115|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128124": {
+ "content": "<|reserved_special_token_116|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128125": {
+ "content": "<|reserved_special_token_117|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128126": {
+ "content": "<|reserved_special_token_118|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128127": {
+ "content": "<|reserved_special_token_119|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128128": {
+ "content": "<|reserved_special_token_120|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128129": {
+ "content": "<|reserved_special_token_121|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128130": {
+ "content": "<|reserved_special_token_122|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128131": {
+ "content": "<|reserved_special_token_123|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128132": {
+ "content": "<|reserved_special_token_124|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128133": {
+ "content": "<|reserved_special_token_125|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128134": {
+ "content": "<|reserved_special_token_126|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128135": {
+ "content": "<|reserved_special_token_127|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128136": {
+ "content": "<|reserved_special_token_128|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128137": {
+ "content": "<|reserved_special_token_129|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128138": {
+ "content": "<|reserved_special_token_130|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128139": {
+ "content": "<|reserved_special_token_131|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128140": {
+ "content": "<|reserved_special_token_132|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128141": {
+ "content": "<|reserved_special_token_133|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128142": {
+ "content": "<|reserved_special_token_134|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128143": {
+ "content": "<|reserved_special_token_135|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128144": {
+ "content": "<|reserved_special_token_136|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128145": {
+ "content": "<|reserved_special_token_137|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128146": {
+ "content": "<|reserved_special_token_138|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128147": {
+ "content": "<|reserved_special_token_139|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128148": {
+ "content": "<|reserved_special_token_140|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128149": {
+ "content": "<|reserved_special_token_141|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128150": {
+ "content": "<|reserved_special_token_142|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128151": {
+ "content": "<|reserved_special_token_143|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128152": {
+ "content": "<|reserved_special_token_144|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128153": {
+ "content": "<|reserved_special_token_145|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128154": {
+ "content": "<|reserved_special_token_146|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128155": {
+ "content": "<|reserved_special_token_147|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128156": {
+ "content": "<|reserved_special_token_148|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128157": {
+ "content": "<|reserved_special_token_149|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128158": {
+ "content": "<|reserved_special_token_150|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128159": {
+ "content": "<|reserved_special_token_151|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128160": {
+ "content": "<|reserved_special_token_152|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128161": {
+ "content": "<|reserved_special_token_153|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128162": {
+ "content": "<|reserved_special_token_154|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128163": {
+ "content": "<|reserved_special_token_155|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128164": {
+ "content": "<|reserved_special_token_156|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128165": {
+ "content": "<|reserved_special_token_157|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128166": {
+ "content": "<|reserved_special_token_158|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128167": {
+ "content": "<|reserved_special_token_159|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128168": {
+ "content": "<|reserved_special_token_160|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128169": {
+ "content": "<|reserved_special_token_161|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128170": {
+ "content": "<|reserved_special_token_162|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128171": {
+ "content": "<|reserved_special_token_163|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128172": {
+ "content": "<|reserved_special_token_164|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128173": {
+ "content": "<|reserved_special_token_165|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128174": {
+ "content": "<|reserved_special_token_166|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128175": {
+ "content": "<|reserved_special_token_167|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128176": {
+ "content": "<|reserved_special_token_168|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128177": {
+ "content": "<|reserved_special_token_169|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128178": {
+ "content": "<|reserved_special_token_170|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128179": {
+ "content": "<|reserved_special_token_171|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128180": {
+ "content": "<|reserved_special_token_172|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128181": {
+ "content": "<|reserved_special_token_173|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128182": {
+ "content": "<|reserved_special_token_174|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128183": {
+ "content": "<|reserved_special_token_175|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128184": {
+ "content": "<|reserved_special_token_176|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128185": {
+ "content": "<|reserved_special_token_177|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128186": {
+ "content": "<|reserved_special_token_178|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128187": {
+ "content": "<|reserved_special_token_179|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128188": {
+ "content": "<|reserved_special_token_180|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128189": {
+ "content": "<|reserved_special_token_181|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128190": {
+ "content": "<|reserved_special_token_182|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128191": {
+ "content": "<|reserved_special_token_183|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128192": {
+ "content": "<|reserved_special_token_184|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128193": {
+ "content": "<|reserved_special_token_185|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128194": {
+ "content": "<|reserved_special_token_186|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128195": {
+ "content": "<|reserved_special_token_187|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128196": {
+ "content": "<|reserved_special_token_188|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128197": {
+ "content": "<|reserved_special_token_189|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128198": {
+ "content": "<|reserved_special_token_190|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128199": {
+ "content": "<|reserved_special_token_191|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128200": {
+ "content": "<|reserved_special_token_192|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128201": {
+ "content": "<|reserved_special_token_193|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128202": {
+ "content": "<|reserved_special_token_194|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128203": {
+ "content": "<|reserved_special_token_195|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128204": {
+ "content": "<|reserved_special_token_196|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128205": {
+ "content": "<|reserved_special_token_197|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128206": {
+ "content": "<|reserved_special_token_198|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128207": {
+ "content": "<|reserved_special_token_199|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128208": {
+ "content": "<|reserved_special_token_200|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128209": {
+ "content": "<|reserved_special_token_201|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128210": {
+ "content": "<|reserved_special_token_202|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128211": {
+ "content": "<|reserved_special_token_203|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128212": {
+ "content": "<|reserved_special_token_204|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128213": {
+ "content": "<|reserved_special_token_205|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128214": {
+ "content": "<|reserved_special_token_206|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128215": {
+ "content": "<|reserved_special_token_207|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128216": {
+ "content": "<|reserved_special_token_208|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128217": {
+ "content": "<|reserved_special_token_209|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128218": {
+ "content": "<|reserved_special_token_210|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128219": {
+ "content": "<|reserved_special_token_211|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128220": {
+ "content": "<|reserved_special_token_212|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128221": {
+ "content": "<|reserved_special_token_213|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128222": {
+ "content": "<|reserved_special_token_214|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128223": {
+ "content": "<|reserved_special_token_215|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128224": {
+ "content": "<|reserved_special_token_216|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128225": {
+ "content": "<|reserved_special_token_217|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128226": {
+ "content": "<|reserved_special_token_218|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128227": {
+ "content": "<|reserved_special_token_219|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128228": {
+ "content": "<|reserved_special_token_220|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128229": {
+ "content": "<|reserved_special_token_221|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128230": {
+ "content": "<|reserved_special_token_222|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128231": {
+ "content": "<|reserved_special_token_223|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128232": {
+ "content": "<|reserved_special_token_224|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128233": {
+ "content": "<|reserved_special_token_225|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128234": {
+ "content": "<|reserved_special_token_226|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128235": {
+ "content": "<|reserved_special_token_227|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128236": {
+ "content": "<|reserved_special_token_228|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128237": {
+ "content": "<|reserved_special_token_229|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128238": {
+ "content": "<|reserved_special_token_230|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128239": {
+ "content": "<|reserved_special_token_231|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128240": {
+ "content": "<|reserved_special_token_232|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128241": {
+ "content": "<|reserved_special_token_233|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128242": {
+ "content": "<|reserved_special_token_234|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128243": {
+ "content": "<|reserved_special_token_235|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128244": {
+ "content": "<|reserved_special_token_236|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128245": {
+ "content": "<|reserved_special_token_237|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128246": {
+ "content": "<|reserved_special_token_238|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128247": {
+ "content": "<|reserved_special_token_239|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128248": {
+ "content": "<|reserved_special_token_240|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128249": {
+ "content": "<|reserved_special_token_241|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128250": {
+ "content": "<|reserved_special_token_242|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128251": {
+ "content": "<|reserved_special_token_243|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128252": {
+ "content": "<|reserved_special_token_244|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128253": {
+ "content": "<|reserved_special_token_245|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128254": {
+ "content": "<|reserved_special_token_246|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128255": {
+ "content": "<|reserved_special_token_247|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<|begin_of_text|>",
+ "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n",
+ "clean_up_tokenization_spaces": true,
+ "eos_token": "<|eot_id|>",
+ "extra_special_tokens": {},
+ "model_input_names": [
+ "input_ids",
+ "attention_mask"
+ ],
+ "model_max_length": 131072,
+ "pad_token": "<|end_of_text|>",
+ "tokenizer_class": "PreTrainedTokenizer"
+}
diff --git a/checkpoint-432/trainer_state.json b/checkpoint-432/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..cf333bacd624c0a92892178c2da6746b0b59a54e
--- /dev/null
+++ b/checkpoint-432/trainer_state.json
@@ -0,0 +1,3057 @@
+{
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 5.926267281105991,
+ "eval_steps": 500,
+ "global_step": 432,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.013824884792626729,
+ "grad_norm": 34.963134765625,
+ "learning_rate": 5.0000000000000004e-08,
+ "loss": 2.5476,
+ "step": 1
+ },
+ {
+ "epoch": 0.027649769585253458,
+ "grad_norm": 35.32600021362305,
+ "learning_rate": 1.0000000000000001e-07,
+ "loss": 2.6058,
+ "step": 2
+ },
+ {
+ "epoch": 0.041474654377880185,
+ "grad_norm": 34.955448150634766,
+ "learning_rate": 1.5000000000000002e-07,
+ "loss": 2.5871,
+ "step": 3
+ },
+ {
+ "epoch": 0.055299539170506916,
+ "grad_norm": 35.09806442260742,
+ "learning_rate": 2.0000000000000002e-07,
+ "loss": 2.5912,
+ "step": 4
+ },
+ {
+ "epoch": 0.06912442396313365,
+ "grad_norm": 34.88739776611328,
+ "learning_rate": 2.5000000000000004e-07,
+ "loss": 2.592,
+ "step": 5
+ },
+ {
+ "epoch": 0.08294930875576037,
+ "grad_norm": 34.84288024902344,
+ "learning_rate": 3.0000000000000004e-07,
+ "loss": 2.5609,
+ "step": 6
+ },
+ {
+ "epoch": 0.0967741935483871,
+ "grad_norm": 35.0090217590332,
+ "learning_rate": 3.5000000000000004e-07,
+ "loss": 2.5651,
+ "step": 7
+ },
+ {
+ "epoch": 0.11059907834101383,
+ "grad_norm": 35.03983688354492,
+ "learning_rate": 4.0000000000000003e-07,
+ "loss": 2.5437,
+ "step": 8
+ },
+ {
+ "epoch": 0.12442396313364056,
+ "grad_norm": 34.802833557128906,
+ "learning_rate": 4.5000000000000003e-07,
+ "loss": 2.5448,
+ "step": 9
+ },
+ {
+ "epoch": 0.1382488479262673,
+ "grad_norm": 34.5220947265625,
+ "learning_rate": 5.000000000000001e-07,
+ "loss": 2.504,
+ "step": 10
+ },
+ {
+ "epoch": 0.15207373271889402,
+ "grad_norm": 34.401580810546875,
+ "learning_rate": 5.5e-07,
+ "loss": 2.4814,
+ "step": 11
+ },
+ {
+ "epoch": 0.16589861751152074,
+ "grad_norm": 33.76997375488281,
+ "learning_rate": 6.000000000000001e-07,
+ "loss": 2.4282,
+ "step": 12
+ },
+ {
+ "epoch": 0.17972350230414746,
+ "grad_norm": 33.53415298461914,
+ "learning_rate": 6.5e-07,
+ "loss": 2.4216,
+ "step": 13
+ },
+ {
+ "epoch": 0.1935483870967742,
+ "grad_norm": 32.401580810546875,
+ "learning_rate": 7.000000000000001e-07,
+ "loss": 2.3362,
+ "step": 14
+ },
+ {
+ "epoch": 0.2073732718894009,
+ "grad_norm": 33.636661529541016,
+ "learning_rate": 7.5e-07,
+ "loss": 2.2978,
+ "step": 15
+ },
+ {
+ "epoch": 0.22119815668202766,
+ "grad_norm": 31.3782901763916,
+ "learning_rate": 8.000000000000001e-07,
+ "loss": 2.1358,
+ "step": 16
+ },
+ {
+ "epoch": 0.2350230414746544,
+ "grad_norm": 30.72391700744629,
+ "learning_rate": 8.500000000000001e-07,
+ "loss": 2.0652,
+ "step": 17
+ },
+ {
+ "epoch": 0.2488479262672811,
+ "grad_norm": 30.817584991455078,
+ "learning_rate": 9.000000000000001e-07,
+ "loss": 2.0115,
+ "step": 18
+ },
+ {
+ "epoch": 0.2626728110599078,
+ "grad_norm": 29.683996200561523,
+ "learning_rate": 9.500000000000001e-07,
+ "loss": 1.8668,
+ "step": 19
+ },
+ {
+ "epoch": 0.2764976958525346,
+ "grad_norm": 29.506683349609375,
+ "learning_rate": 1.0000000000000002e-06,
+ "loss": 1.7796,
+ "step": 20
+ },
+ {
+ "epoch": 0.2903225806451613,
+ "grad_norm": 27.55340003967285,
+ "learning_rate": 1.0500000000000001e-06,
+ "loss": 1.5656,
+ "step": 21
+ },
+ {
+ "epoch": 0.30414746543778803,
+ "grad_norm": 27.78036880493164,
+ "learning_rate": 1.1e-06,
+ "loss": 1.5112,
+ "step": 22
+ },
+ {
+ "epoch": 0.31797235023041476,
+ "grad_norm": 26.36115264892578,
+ "learning_rate": 1.1500000000000002e-06,
+ "loss": 1.3283,
+ "step": 23
+ },
+ {
+ "epoch": 0.3317972350230415,
+ "grad_norm": 25.388761520385742,
+ "learning_rate": 1.2000000000000002e-06,
+ "loss": 1.137,
+ "step": 24
+ },
+ {
+ "epoch": 0.3456221198156682,
+ "grad_norm": 25.21432876586914,
+ "learning_rate": 1.25e-06,
+ "loss": 0.9867,
+ "step": 25
+ },
+ {
+ "epoch": 0.35944700460829493,
+ "grad_norm": 24.924489974975586,
+ "learning_rate": 1.3e-06,
+ "loss": 0.7122,
+ "step": 26
+ },
+ {
+ "epoch": 0.37327188940092165,
+ "grad_norm": 21.881420135498047,
+ "learning_rate": 1.3500000000000002e-06,
+ "loss": 0.4952,
+ "step": 27
+ },
+ {
+ "epoch": 0.3870967741935484,
+ "grad_norm": 17.67154884338379,
+ "learning_rate": 1.4000000000000001e-06,
+ "loss": 0.3602,
+ "step": 28
+ },
+ {
+ "epoch": 0.4009216589861751,
+ "grad_norm": 11.489490509033203,
+ "learning_rate": 1.45e-06,
+ "loss": 0.2432,
+ "step": 29
+ },
+ {
+ "epoch": 0.4147465437788018,
+ "grad_norm": 7.622438907623291,
+ "learning_rate": 1.5e-06,
+ "loss": 0.189,
+ "step": 30
+ },
+ {
+ "epoch": 0.42857142857142855,
+ "grad_norm": 4.340638637542725,
+ "learning_rate": 1.5500000000000002e-06,
+ "loss": 0.1302,
+ "step": 31
+ },
+ {
+ "epoch": 0.4423963133640553,
+ "grad_norm": 3.079514980316162,
+ "learning_rate": 1.6000000000000001e-06,
+ "loss": 0.1075,
+ "step": 32
+ },
+ {
+ "epoch": 0.45622119815668205,
+ "grad_norm": 2.355943441390991,
+ "learning_rate": 1.6500000000000003e-06,
+ "loss": 0.0998,
+ "step": 33
+ },
+ {
+ "epoch": 0.4700460829493088,
+ "grad_norm": 1.9480725526809692,
+ "learning_rate": 1.7000000000000002e-06,
+ "loss": 0.0926,
+ "step": 34
+ },
+ {
+ "epoch": 0.4838709677419355,
+ "grad_norm": 1.8598166704177856,
+ "learning_rate": 1.75e-06,
+ "loss": 0.0733,
+ "step": 35
+ },
+ {
+ "epoch": 0.4976958525345622,
+ "grad_norm": 0.9892730712890625,
+ "learning_rate": 1.8000000000000001e-06,
+ "loss": 0.0664,
+ "step": 36
+ },
+ {
+ "epoch": 0.511520737327189,
+ "grad_norm": 0.8992418050765991,
+ "learning_rate": 1.85e-06,
+ "loss": 0.0709,
+ "step": 37
+ },
+ {
+ "epoch": 0.5253456221198156,
+ "grad_norm": 0.7340101599693298,
+ "learning_rate": 1.9000000000000002e-06,
+ "loss": 0.0535,
+ "step": 38
+ },
+ {
+ "epoch": 0.5391705069124424,
+ "grad_norm": 0.7032178044319153,
+ "learning_rate": 1.9500000000000004e-06,
+ "loss": 0.0573,
+ "step": 39
+ },
+ {
+ "epoch": 0.5529953917050692,
+ "grad_norm": 0.6449429392814636,
+ "learning_rate": 2.0000000000000003e-06,
+ "loss": 0.0576,
+ "step": 40
+ },
+ {
+ "epoch": 0.5668202764976958,
+ "grad_norm": 0.6358592510223389,
+ "learning_rate": 2.05e-06,
+ "loss": 0.0502,
+ "step": 41
+ },
+ {
+ "epoch": 0.5806451612903226,
+ "grad_norm": 0.572036623954773,
+ "learning_rate": 2.1000000000000002e-06,
+ "loss": 0.0556,
+ "step": 42
+ },
+ {
+ "epoch": 0.5944700460829493,
+ "grad_norm": 0.6538863778114319,
+ "learning_rate": 2.15e-06,
+ "loss": 0.0556,
+ "step": 43
+ },
+ {
+ "epoch": 0.6082949308755761,
+ "grad_norm": 0.3532159626483917,
+ "learning_rate": 2.2e-06,
+ "loss": 0.0452,
+ "step": 44
+ },
+ {
+ "epoch": 0.6221198156682027,
+ "grad_norm": 0.4853012263774872,
+ "learning_rate": 2.25e-06,
+ "loss": 0.0471,
+ "step": 45
+ },
+ {
+ "epoch": 0.6359447004608295,
+ "grad_norm": 0.4761648178100586,
+ "learning_rate": 2.3000000000000004e-06,
+ "loss": 0.0469,
+ "step": 46
+ },
+ {
+ "epoch": 0.6497695852534562,
+ "grad_norm": 0.6094638109207153,
+ "learning_rate": 2.35e-06,
+ "loss": 0.047,
+ "step": 47
+ },
+ {
+ "epoch": 0.663594470046083,
+ "grad_norm": 0.5211306214332581,
+ "learning_rate": 2.4000000000000003e-06,
+ "loss": 0.0402,
+ "step": 48
+ },
+ {
+ "epoch": 0.6774193548387096,
+ "grad_norm": 0.2997778356075287,
+ "learning_rate": 2.4500000000000003e-06,
+ "loss": 0.0425,
+ "step": 49
+ },
+ {
+ "epoch": 0.6912442396313364,
+ "grad_norm": 0.37834689021110535,
+ "learning_rate": 2.5e-06,
+ "loss": 0.0446,
+ "step": 50
+ },
+ {
+ "epoch": 0.7050691244239631,
+ "grad_norm": 0.31011995673179626,
+ "learning_rate": 2.55e-06,
+ "loss": 0.0406,
+ "step": 51
+ },
+ {
+ "epoch": 0.7188940092165899,
+ "grad_norm": 0.3113131523132324,
+ "learning_rate": 2.6e-06,
+ "loss": 0.0368,
+ "step": 52
+ },
+ {
+ "epoch": 0.7327188940092166,
+ "grad_norm": 0.5685846209526062,
+ "learning_rate": 2.6500000000000005e-06,
+ "loss": 0.0389,
+ "step": 53
+ },
+ {
+ "epoch": 0.7465437788018433,
+ "grad_norm": 0.29334983229637146,
+ "learning_rate": 2.7000000000000004e-06,
+ "loss": 0.0423,
+ "step": 54
+ },
+ {
+ "epoch": 0.7603686635944701,
+ "grad_norm": 0.5776861906051636,
+ "learning_rate": 2.7500000000000004e-06,
+ "loss": 0.0399,
+ "step": 55
+ },
+ {
+ "epoch": 0.7741935483870968,
+ "grad_norm": 0.35423165559768677,
+ "learning_rate": 2.8000000000000003e-06,
+ "loss": 0.0357,
+ "step": 56
+ },
+ {
+ "epoch": 0.7880184331797235,
+ "grad_norm": 0.37902742624282837,
+ "learning_rate": 2.85e-06,
+ "loss": 0.0407,
+ "step": 57
+ },
+ {
+ "epoch": 0.8018433179723502,
+ "grad_norm": 0.26948878169059753,
+ "learning_rate": 2.9e-06,
+ "loss": 0.0351,
+ "step": 58
+ },
+ {
+ "epoch": 0.815668202764977,
+ "grad_norm": 0.35688117146492004,
+ "learning_rate": 2.95e-06,
+ "loss": 0.0377,
+ "step": 59
+ },
+ {
+ "epoch": 0.8294930875576036,
+ "grad_norm": 0.5287911891937256,
+ "learning_rate": 3e-06,
+ "loss": 0.0377,
+ "step": 60
+ },
+ {
+ "epoch": 0.8433179723502304,
+ "grad_norm": 0.2950785756111145,
+ "learning_rate": 3.05e-06,
+ "loss": 0.0361,
+ "step": 61
+ },
+ {
+ "epoch": 0.8571428571428571,
+ "grad_norm": 0.2789723575115204,
+ "learning_rate": 3.1000000000000004e-06,
+ "loss": 0.032,
+ "step": 62
+ },
+ {
+ "epoch": 0.8709677419354839,
+ "grad_norm": 0.2802198529243469,
+ "learning_rate": 3.1500000000000003e-06,
+ "loss": 0.0394,
+ "step": 63
+ },
+ {
+ "epoch": 0.8847926267281107,
+ "grad_norm": 0.286981463432312,
+ "learning_rate": 3.2000000000000003e-06,
+ "loss": 0.033,
+ "step": 64
+ },
+ {
+ "epoch": 0.8986175115207373,
+ "grad_norm": 0.37392762303352356,
+ "learning_rate": 3.2500000000000002e-06,
+ "loss": 0.0335,
+ "step": 65
+ },
+ {
+ "epoch": 0.9124423963133641,
+ "grad_norm": 0.25025588274002075,
+ "learning_rate": 3.3000000000000006e-06,
+ "loss": 0.0311,
+ "step": 66
+ },
+ {
+ "epoch": 0.9262672811059908,
+ "grad_norm": 0.4292861521244049,
+ "learning_rate": 3.3500000000000005e-06,
+ "loss": 0.0362,
+ "step": 67
+ },
+ {
+ "epoch": 0.9400921658986175,
+ "grad_norm": 0.4717651307582855,
+ "learning_rate": 3.4000000000000005e-06,
+ "loss": 0.0303,
+ "step": 68
+ },
+ {
+ "epoch": 0.9539170506912442,
+ "grad_norm": 0.49291253089904785,
+ "learning_rate": 3.45e-06,
+ "loss": 0.0352,
+ "step": 69
+ },
+ {
+ "epoch": 0.967741935483871,
+ "grad_norm": 0.3729935586452484,
+ "learning_rate": 3.5e-06,
+ "loss": 0.0297,
+ "step": 70
+ },
+ {
+ "epoch": 0.9815668202764977,
+ "grad_norm": 0.27150583267211914,
+ "learning_rate": 3.5500000000000003e-06,
+ "loss": 0.0326,
+ "step": 71
+ },
+ {
+ "epoch": 0.9953917050691244,
+ "grad_norm": 0.34516096115112305,
+ "learning_rate": 3.6000000000000003e-06,
+ "loss": 0.0336,
+ "step": 72
+ },
+ {
+ "epoch": 1.0,
+ "grad_norm": 0.34516096115112305,
+ "learning_rate": 3.65e-06,
+ "loss": 0.0274,
+ "step": 73
+ },
+ {
+ "epoch": 1.0138248847926268,
+ "grad_norm": 0.6282734870910645,
+ "learning_rate": 3.7e-06,
+ "loss": 0.0289,
+ "step": 74
+ },
+ {
+ "epoch": 1.0276497695852536,
+ "grad_norm": 0.2935558557510376,
+ "learning_rate": 3.7500000000000005e-06,
+ "loss": 0.0308,
+ "step": 75
+ },
+ {
+ "epoch": 1.0414746543778801,
+ "grad_norm": 0.3166769742965698,
+ "learning_rate": 3.8000000000000005e-06,
+ "loss": 0.0277,
+ "step": 76
+ },
+ {
+ "epoch": 1.055299539170507,
+ "grad_norm": 0.38190239667892456,
+ "learning_rate": 3.85e-06,
+ "loss": 0.0338,
+ "step": 77
+ },
+ {
+ "epoch": 1.0691244239631337,
+ "grad_norm": 0.2779421806335449,
+ "learning_rate": 3.900000000000001e-06,
+ "loss": 0.03,
+ "step": 78
+ },
+ {
+ "epoch": 1.0829493087557605,
+ "grad_norm": 0.4055996537208557,
+ "learning_rate": 3.95e-06,
+ "loss": 0.0295,
+ "step": 79
+ },
+ {
+ "epoch": 1.096774193548387,
+ "grad_norm": 0.2987312972545624,
+ "learning_rate": 4.000000000000001e-06,
+ "loss": 0.028,
+ "step": 80
+ },
+ {
+ "epoch": 1.1105990783410138,
+ "grad_norm": 0.2674776017665863,
+ "learning_rate": 4.05e-06,
+ "loss": 0.0243,
+ "step": 81
+ },
+ {
+ "epoch": 1.1244239631336406,
+ "grad_norm": 0.29042816162109375,
+ "learning_rate": 4.1e-06,
+ "loss": 0.0318,
+ "step": 82
+ },
+ {
+ "epoch": 1.1382488479262673,
+ "grad_norm": 0.2904883027076721,
+ "learning_rate": 4.15e-06,
+ "loss": 0.0257,
+ "step": 83
+ },
+ {
+ "epoch": 1.1520737327188941,
+ "grad_norm": 0.30603015422821045,
+ "learning_rate": 4.2000000000000004e-06,
+ "loss": 0.0284,
+ "step": 84
+ },
+ {
+ "epoch": 1.1658986175115207,
+ "grad_norm": 0.23131045699119568,
+ "learning_rate": 4.25e-06,
+ "loss": 0.0285,
+ "step": 85
+ },
+ {
+ "epoch": 1.1797235023041475,
+ "grad_norm": 0.26788002252578735,
+ "learning_rate": 4.3e-06,
+ "loss": 0.0269,
+ "step": 86
+ },
+ {
+ "epoch": 1.1935483870967742,
+ "grad_norm": 0.2639651894569397,
+ "learning_rate": 4.350000000000001e-06,
+ "loss": 0.0289,
+ "step": 87
+ },
+ {
+ "epoch": 1.2073732718894008,
+ "grad_norm": 0.25068584084510803,
+ "learning_rate": 4.4e-06,
+ "loss": 0.0275,
+ "step": 88
+ },
+ {
+ "epoch": 1.2211981566820276,
+ "grad_norm": 0.25494542717933655,
+ "learning_rate": 4.450000000000001e-06,
+ "loss": 0.0275,
+ "step": 89
+ },
+ {
+ "epoch": 1.2350230414746544,
+ "grad_norm": 0.31125035881996155,
+ "learning_rate": 4.5e-06,
+ "loss": 0.0251,
+ "step": 90
+ },
+ {
+ "epoch": 1.2488479262672811,
+ "grad_norm": 0.2691773474216461,
+ "learning_rate": 4.5500000000000005e-06,
+ "loss": 0.0267,
+ "step": 91
+ },
+ {
+ "epoch": 1.262672811059908,
+ "grad_norm": 0.20079147815704346,
+ "learning_rate": 4.600000000000001e-06,
+ "loss": 0.0263,
+ "step": 92
+ },
+ {
+ "epoch": 1.2764976958525347,
+ "grad_norm": 0.28027331829071045,
+ "learning_rate": 4.65e-06,
+ "loss": 0.0227,
+ "step": 93
+ },
+ {
+ "epoch": 1.2903225806451613,
+ "grad_norm": 0.40053099393844604,
+ "learning_rate": 4.7e-06,
+ "loss": 0.0246,
+ "step": 94
+ },
+ {
+ "epoch": 1.304147465437788,
+ "grad_norm": 0.33066362142562866,
+ "learning_rate": 4.75e-06,
+ "loss": 0.0221,
+ "step": 95
+ },
+ {
+ "epoch": 1.3179723502304148,
+ "grad_norm": 0.2531339228153229,
+ "learning_rate": 4.800000000000001e-06,
+ "loss": 0.0216,
+ "step": 96
+ },
+ {
+ "epoch": 1.3317972350230414,
+ "grad_norm": 0.37544378638267517,
+ "learning_rate": 4.85e-06,
+ "loss": 0.0247,
+ "step": 97
+ },
+ {
+ "epoch": 1.3456221198156681,
+ "grad_norm": 0.34273672103881836,
+ "learning_rate": 4.9000000000000005e-06,
+ "loss": 0.0217,
+ "step": 98
+ },
+ {
+ "epoch": 1.359447004608295,
+ "grad_norm": 0.2338661253452301,
+ "learning_rate": 4.95e-06,
+ "loss": 0.0237,
+ "step": 99
+ },
+ {
+ "epoch": 1.3732718894009217,
+ "grad_norm": 0.30151981115341187,
+ "learning_rate": 5e-06,
+ "loss": 0.0248,
+ "step": 100
+ },
+ {
+ "epoch": 1.3870967741935485,
+ "grad_norm": 0.3205336630344391,
+ "learning_rate": 4.999888074163108e-06,
+ "loss": 0.0232,
+ "step": 101
+ },
+ {
+ "epoch": 1.400921658986175,
+ "grad_norm": 0.2705315351486206,
+ "learning_rate": 4.999552306674345e-06,
+ "loss": 0.0245,
+ "step": 102
+ },
+ {
+ "epoch": 1.4147465437788018,
+ "grad_norm": 0.2564137578010559,
+ "learning_rate": 4.998992727598557e-06,
+ "loss": 0.0274,
+ "step": 103
+ },
+ {
+ "epoch": 1.4285714285714286,
+ "grad_norm": 0.1967611312866211,
+ "learning_rate": 4.998209387040829e-06,
+ "loss": 0.0173,
+ "step": 104
+ },
+ {
+ "epoch": 1.4423963133640554,
+ "grad_norm": 0.2568240761756897,
+ "learning_rate": 4.9972023551419995e-06,
+ "loss": 0.0223,
+ "step": 105
+ },
+ {
+ "epoch": 1.456221198156682,
+ "grad_norm": 0.2236352413892746,
+ "learning_rate": 4.995971722072379e-06,
+ "loss": 0.0202,
+ "step": 106
+ },
+ {
+ "epoch": 1.4700460829493087,
+ "grad_norm": 0.3389627933502197,
+ "learning_rate": 4.9945175980236745e-06,
+ "loss": 0.0214,
+ "step": 107
+ },
+ {
+ "epoch": 1.4838709677419355,
+ "grad_norm": 0.31428012251853943,
+ "learning_rate": 4.992840113199131e-06,
+ "loss": 0.0188,
+ "step": 108
+ },
+ {
+ "epoch": 1.4976958525345623,
+ "grad_norm": 0.41508516669273376,
+ "learning_rate": 4.990939417801859e-06,
+ "loss": 0.0213,
+ "step": 109
+ },
+ {
+ "epoch": 1.511520737327189,
+ "grad_norm": 0.19615545868873596,
+ "learning_rate": 4.988815682021398e-06,
+ "loss": 0.0191,
+ "step": 110
+ },
+ {
+ "epoch": 1.5253456221198156,
+ "grad_norm": 0.2059931755065918,
+ "learning_rate": 4.986469096018472e-06,
+ "loss": 0.0208,
+ "step": 111
+ },
+ {
+ "epoch": 1.5391705069124424,
+ "grad_norm": 0.26946336030960083,
+ "learning_rate": 4.983899869907963e-06,
+ "loss": 0.0192,
+ "step": 112
+ },
+ {
+ "epoch": 1.5529953917050692,
+ "grad_norm": 0.3227538466453552,
+ "learning_rate": 4.981108233740096e-06,
+ "loss": 0.0169,
+ "step": 113
+ },
+ {
+ "epoch": 1.5668202764976957,
+ "grad_norm": 0.2811918258666992,
+ "learning_rate": 4.978094437479843e-06,
+ "loss": 0.0151,
+ "step": 114
+ },
+ {
+ "epoch": 1.5806451612903225,
+ "grad_norm": 0.32980477809906006,
+ "learning_rate": 4.97485875098454e-06,
+ "loss": 0.0182,
+ "step": 115
+ },
+ {
+ "epoch": 1.5944700460829493,
+ "grad_norm": 0.2759259045124054,
+ "learning_rate": 4.971401463979722e-06,
+ "loss": 0.0192,
+ "step": 116
+ },
+ {
+ "epoch": 1.608294930875576,
+ "grad_norm": 0.2572178840637207,
+ "learning_rate": 4.967722886033181e-06,
+ "loss": 0.0198,
+ "step": 117
+ },
+ {
+ "epoch": 1.6221198156682028,
+ "grad_norm": 0.3238658905029297,
+ "learning_rate": 4.963823346527249e-06,
+ "loss": 0.0186,
+ "step": 118
+ },
+ {
+ "epoch": 1.6359447004608296,
+ "grad_norm": 0.3834918737411499,
+ "learning_rate": 4.959703194629304e-06,
+ "loss": 0.0188,
+ "step": 119
+ },
+ {
+ "epoch": 1.6497695852534562,
+ "grad_norm": 0.23881244659423828,
+ "learning_rate": 4.955362799260507e-06,
+ "loss": 0.0182,
+ "step": 120
+ },
+ {
+ "epoch": 1.663594470046083,
+ "grad_norm": 0.1885918825864792,
+ "learning_rate": 4.950802549062764e-06,
+ "loss": 0.0183,
+ "step": 121
+ },
+ {
+ "epoch": 1.6774193548387095,
+ "grad_norm": 0.34959614276885986,
+ "learning_rate": 4.946022852363932e-06,
+ "loss": 0.0173,
+ "step": 122
+ },
+ {
+ "epoch": 1.6912442396313363,
+ "grad_norm": 0.22990310192108154,
+ "learning_rate": 4.9410241371412525e-06,
+ "loss": 0.0135,
+ "step": 123
+ },
+ {
+ "epoch": 1.705069124423963,
+ "grad_norm": 0.2790350615978241,
+ "learning_rate": 4.935806850983034e-06,
+ "loss": 0.0159,
+ "step": 124
+ },
+ {
+ "epoch": 1.7188940092165899,
+ "grad_norm": 0.3218020796775818,
+ "learning_rate": 4.9303714610485705e-06,
+ "loss": 0.0176,
+ "step": 125
+ },
+ {
+ "epoch": 1.7327188940092166,
+ "grad_norm": 0.2294609695672989,
+ "learning_rate": 4.924718454026318e-06,
+ "loss": 0.0149,
+ "step": 126
+ },
+ {
+ "epoch": 1.7465437788018434,
+ "grad_norm": 0.3427927494049072,
+ "learning_rate": 4.918848336090309e-06,
+ "loss": 0.0165,
+ "step": 127
+ },
+ {
+ "epoch": 1.7603686635944702,
+ "grad_norm": 0.22731825709342957,
+ "learning_rate": 4.912761632854834e-06,
+ "loss": 0.0145,
+ "step": 128
+ },
+ {
+ "epoch": 1.7741935483870968,
+ "grad_norm": 0.35364386439323425,
+ "learning_rate": 4.906458889327375e-06,
+ "loss": 0.0161,
+ "step": 129
+ },
+ {
+ "epoch": 1.7880184331797235,
+ "grad_norm": 0.29476454854011536,
+ "learning_rate": 4.899940669859807e-06,
+ "loss": 0.0154,
+ "step": 130
+ },
+ {
+ "epoch": 1.80184331797235,
+ "grad_norm": 0.28667864203453064,
+ "learning_rate": 4.893207558097867e-06,
+ "loss": 0.0143,
+ "step": 131
+ },
+ {
+ "epoch": 1.8156682027649769,
+ "grad_norm": 0.2731999158859253,
+ "learning_rate": 4.8862601569288885e-06,
+ "loss": 0.0141,
+ "step": 132
+ },
+ {
+ "epoch": 1.8294930875576036,
+ "grad_norm": 0.2670470178127289,
+ "learning_rate": 4.879099088427824e-06,
+ "loss": 0.0131,
+ "step": 133
+ },
+ {
+ "epoch": 1.8433179723502304,
+ "grad_norm": 0.23313525319099426,
+ "learning_rate": 4.871724993801541e-06,
+ "loss": 0.012,
+ "step": 134
+ },
+ {
+ "epoch": 1.8571428571428572,
+ "grad_norm": 0.2192607820034027,
+ "learning_rate": 4.864138533331411e-06,
+ "loss": 0.0125,
+ "step": 135
+ },
+ {
+ "epoch": 1.870967741935484,
+ "grad_norm": 0.26603585481643677,
+ "learning_rate": 4.8563403863141825e-06,
+ "loss": 0.0121,
+ "step": 136
+ },
+ {
+ "epoch": 1.8847926267281108,
+ "grad_norm": 0.32500001788139343,
+ "learning_rate": 4.84833125100116e-06,
+ "loss": 0.0116,
+ "step": 137
+ },
+ {
+ "epoch": 1.8986175115207373,
+ "grad_norm": 0.24893291294574738,
+ "learning_rate": 4.840111844535682e-06,
+ "loss": 0.0119,
+ "step": 138
+ },
+ {
+ "epoch": 1.912442396313364,
+ "grad_norm": 0.17670764029026031,
+ "learning_rate": 4.8316829028889076e-06,
+ "loss": 0.0096,
+ "step": 139
+ },
+ {
+ "epoch": 1.9262672811059907,
+ "grad_norm": 0.16747575998306274,
+ "learning_rate": 4.823045180793914e-06,
+ "loss": 0.0113,
+ "step": 140
+ },
+ {
+ "epoch": 1.9400921658986174,
+ "grad_norm": 0.19587458670139313,
+ "learning_rate": 4.8141994516781196e-06,
+ "loss": 0.0111,
+ "step": 141
+ },
+ {
+ "epoch": 1.9539170506912442,
+ "grad_norm": 0.237543985247612,
+ "learning_rate": 4.805146507594034e-06,
+ "loss": 0.0088,
+ "step": 142
+ },
+ {
+ "epoch": 1.967741935483871,
+ "grad_norm": 0.22710399329662323,
+ "learning_rate": 4.7958871591483305e-06,
+ "loss": 0.0085,
+ "step": 143
+ },
+ {
+ "epoch": 1.9815668202764978,
+ "grad_norm": 0.2946629822254181,
+ "learning_rate": 4.786422235429269e-06,
+ "loss": 0.0122,
+ "step": 144
+ },
+ {
+ "epoch": 1.9953917050691246,
+ "grad_norm": 0.2763853371143341,
+ "learning_rate": 4.776752583932455e-06,
+ "loss": 0.0118,
+ "step": 145
+ },
+ {
+ "epoch": 2.0,
+ "grad_norm": 0.2763853371143341,
+ "learning_rate": 4.766879070484957e-06,
+ "loss": 0.0078,
+ "step": 146
+ },
+ {
+ "epoch": 2.013824884792627,
+ "grad_norm": 0.2722196877002716,
+ "learning_rate": 4.756802579167781e-06,
+ "loss": 0.0076,
+ "step": 147
+ },
+ {
+ "epoch": 2.0276497695852536,
+ "grad_norm": 0.18556565046310425,
+ "learning_rate": 4.746524012236706e-06,
+ "loss": 0.0091,
+ "step": 148
+ },
+ {
+ "epoch": 2.0414746543778803,
+ "grad_norm": 0.24442361295223236,
+ "learning_rate": 4.736044290041496e-06,
+ "loss": 0.009,
+ "step": 149
+ },
+ {
+ "epoch": 2.055299539170507,
+ "grad_norm": 0.24207571148872375,
+ "learning_rate": 4.725364350943492e-06,
+ "loss": 0.0085,
+ "step": 150
+ },
+ {
+ "epoch": 2.0691244239631335,
+ "grad_norm": 0.18502290546894073,
+ "learning_rate": 4.714485151231593e-06,
+ "loss": 0.0059,
+ "step": 151
+ },
+ {
+ "epoch": 2.0829493087557602,
+ "grad_norm": 0.3010450303554535,
+ "learning_rate": 4.703407665036622e-06,
+ "loss": 0.0071,
+ "step": 152
+ },
+ {
+ "epoch": 2.096774193548387,
+ "grad_norm": 0.23272967338562012,
+ "learning_rate": 4.692132884244113e-06,
+ "loss": 0.0074,
+ "step": 153
+ },
+ {
+ "epoch": 2.110599078341014,
+ "grad_norm": 0.25476181507110596,
+ "learning_rate": 4.680661818405485e-06,
+ "loss": 0.0082,
+ "step": 154
+ },
+ {
+ "epoch": 2.1244239631336406,
+ "grad_norm": 0.24534538388252258,
+ "learning_rate": 4.668995494647653e-06,
+ "loss": 0.0065,
+ "step": 155
+ },
+ {
+ "epoch": 2.1382488479262673,
+ "grad_norm": 0.1642732173204422,
+ "learning_rate": 4.657134957581057e-06,
+ "loss": 0.0054,
+ "step": 156
+ },
+ {
+ "epoch": 2.152073732718894,
+ "grad_norm": 0.21100501716136932,
+ "learning_rate": 4.645081269206128e-06,
+ "loss": 0.0091,
+ "step": 157
+ },
+ {
+ "epoch": 2.165898617511521,
+ "grad_norm": 0.19043587148189545,
+ "learning_rate": 4.632835508818192e-06,
+ "loss": 0.0047,
+ "step": 158
+ },
+ {
+ "epoch": 2.1797235023041477,
+ "grad_norm": 0.1804375797510147,
+ "learning_rate": 4.620398772910833e-06,
+ "loss": 0.0068,
+ "step": 159
+ },
+ {
+ "epoch": 2.193548387096774,
+ "grad_norm": 0.6586657762527466,
+ "learning_rate": 4.607772175077712e-06,
+ "loss": 0.0049,
+ "step": 160
+ },
+ {
+ "epoch": 2.207373271889401,
+ "grad_norm": 0.18181656301021576,
+ "learning_rate": 4.59495684591285e-06,
+ "loss": 0.0071,
+ "step": 161
+ },
+ {
+ "epoch": 2.2211981566820276,
+ "grad_norm": 0.760053813457489,
+ "learning_rate": 4.581953932909403e-06,
+ "loss": 0.0065,
+ "step": 162
+ },
+ {
+ "epoch": 2.2350230414746544,
+ "grad_norm": 0.1935238242149353,
+ "learning_rate": 4.5687646003569055e-06,
+ "loss": 0.0066,
+ "step": 163
+ },
+ {
+ "epoch": 2.248847926267281,
+ "grad_norm": 0.3035024404525757,
+ "learning_rate": 4.555390029237026e-06,
+ "loss": 0.0046,
+ "step": 164
+ },
+ {
+ "epoch": 2.262672811059908,
+ "grad_norm": 0.16596420109272003,
+ "learning_rate": 4.541831417117815e-06,
+ "loss": 0.007,
+ "step": 165
+ },
+ {
+ "epoch": 2.2764976958525347,
+ "grad_norm": 0.2578873336315155,
+ "learning_rate": 4.528089978046481e-06,
+ "loss": 0.0048,
+ "step": 166
+ },
+ {
+ "epoch": 2.2903225806451615,
+ "grad_norm": 1.7751781940460205,
+ "learning_rate": 4.514166942440679e-06,
+ "loss": 0.0041,
+ "step": 167
+ },
+ {
+ "epoch": 2.3041474654377883,
+ "grad_norm": 0.37872445583343506,
+ "learning_rate": 4.5000635569783365e-06,
+ "loss": 0.0045,
+ "step": 168
+ },
+ {
+ "epoch": 2.3179723502304146,
+ "grad_norm": 0.22949594259262085,
+ "learning_rate": 4.4857810844860325e-06,
+ "loss": 0.0071,
+ "step": 169
+ },
+ {
+ "epoch": 2.3317972350230414,
+ "grad_norm": 0.34662699699401855,
+ "learning_rate": 4.471320803825915e-06,
+ "loss": 0.006,
+ "step": 170
+ },
+ {
+ "epoch": 2.345622119815668,
+ "grad_norm": 0.5892661213874817,
+ "learning_rate": 4.4566840097811956e-06,
+ "loss": 0.0055,
+ "step": 171
+ },
+ {
+ "epoch": 2.359447004608295,
+ "grad_norm": 0.18866907060146332,
+ "learning_rate": 4.4418720129402145e-06,
+ "loss": 0.0036,
+ "step": 172
+ },
+ {
+ "epoch": 2.3732718894009217,
+ "grad_norm": 0.1510942429304123,
+ "learning_rate": 4.426886139579083e-06,
+ "loss": 0.0065,
+ "step": 173
+ },
+ {
+ "epoch": 2.3870967741935485,
+ "grad_norm": 0.21291828155517578,
+ "learning_rate": 4.411727731542937e-06,
+ "loss": 0.004,
+ "step": 174
+ },
+ {
+ "epoch": 2.4009216589861753,
+ "grad_norm": 0.18649035692214966,
+ "learning_rate": 4.39639814612578e-06,
+ "loss": 0.0047,
+ "step": 175
+ },
+ {
+ "epoch": 2.4147465437788016,
+ "grad_norm": 0.19008278846740723,
+ "learning_rate": 4.3808987559489536e-06,
+ "loss": 0.0071,
+ "step": 176
+ },
+ {
+ "epoch": 2.4285714285714284,
+ "grad_norm": 0.26282456517219543,
+ "learning_rate": 4.365230948838232e-06,
+ "loss": 0.0044,
+ "step": 177
+ },
+ {
+ "epoch": 2.442396313364055,
+ "grad_norm": 0.2351403385400772,
+ "learning_rate": 4.349396127699552e-06,
+ "loss": 0.0057,
+ "step": 178
+ },
+ {
+ "epoch": 2.456221198156682,
+ "grad_norm": 0.20451441407203674,
+ "learning_rate": 4.3333957103934025e-06,
+ "loss": 0.003,
+ "step": 179
+ },
+ {
+ "epoch": 2.4700460829493087,
+ "grad_norm": 0.22120380401611328,
+ "learning_rate": 4.317231129607859e-06,
+ "loss": 0.0045,
+ "step": 180
+ },
+ {
+ "epoch": 2.4838709677419355,
+ "grad_norm": 0.18543967604637146,
+ "learning_rate": 4.30090383273031e-06,
+ "loss": 0.0062,
+ "step": 181
+ },
+ {
+ "epoch": 2.4976958525345623,
+ "grad_norm": 0.18473730981349945,
+ "learning_rate": 4.2844152817178476e-06,
+ "loss": 0.0052,
+ "step": 182
+ },
+ {
+ "epoch": 2.511520737327189,
+ "grad_norm": 0.21087361872196198,
+ "learning_rate": 4.267766952966369e-06,
+ "loss": 0.0041,
+ "step": 183
+ },
+ {
+ "epoch": 2.525345622119816,
+ "grad_norm": 0.24977360665798187,
+ "learning_rate": 4.2509603371783776e-06,
+ "loss": 0.0053,
+ "step": 184
+ },
+ {
+ "epoch": 2.539170506912442,
+ "grad_norm": 0.19377018511295319,
+ "learning_rate": 4.233996939229502e-06,
+ "loss": 0.0037,
+ "step": 185
+ },
+ {
+ "epoch": 2.5529953917050694,
+ "grad_norm": 0.21130548417568207,
+ "learning_rate": 4.216878278033753e-06,
+ "loss": 0.0031,
+ "step": 186
+ },
+ {
+ "epoch": 2.5668202764976957,
+ "grad_norm": 0.13288047909736633,
+ "learning_rate": 4.199605886407515e-06,
+ "loss": 0.0029,
+ "step": 187
+ },
+ {
+ "epoch": 2.5806451612903225,
+ "grad_norm": 0.15998876094818115,
+ "learning_rate": 4.1821813109322975e-06,
+ "loss": 0.0031,
+ "step": 188
+ },
+ {
+ "epoch": 2.5944700460829493,
+ "grad_norm": 0.19475506246089935,
+ "learning_rate": 4.164606111816256e-06,
+ "loss": 0.0022,
+ "step": 189
+ },
+ {
+ "epoch": 2.608294930875576,
+ "grad_norm": 0.1446300446987152,
+ "learning_rate": 4.146881862754485e-06,
+ "loss": 0.0025,
+ "step": 190
+ },
+ {
+ "epoch": 2.622119815668203,
+ "grad_norm": 0.13051164150238037,
+ "learning_rate": 4.129010150788112e-06,
+ "loss": 0.0019,
+ "step": 191
+ },
+ {
+ "epoch": 2.6359447004608296,
+ "grad_norm": 0.1953984946012497,
+ "learning_rate": 4.110992576162193e-06,
+ "loss": 0.0021,
+ "step": 192
+ },
+ {
+ "epoch": 2.6497695852534564,
+ "grad_norm": 0.23630598187446594,
+ "learning_rate": 4.092830752182423e-06,
+ "loss": 0.002,
+ "step": 193
+ },
+ {
+ "epoch": 2.6635944700460827,
+ "grad_norm": 0.2919062376022339,
+ "learning_rate": 4.074526305070679e-06,
+ "loss": 0.0017,
+ "step": 194
+ },
+ {
+ "epoch": 2.6774193548387095,
+ "grad_norm": 0.22015534341335297,
+ "learning_rate": 4.056080873819412e-06,
+ "loss": 0.0025,
+ "step": 195
+ },
+ {
+ "epoch": 2.6912442396313363,
+ "grad_norm": 0.9449160099029541,
+ "learning_rate": 4.037496110044885e-06,
+ "loss": 0.0024,
+ "step": 196
+ },
+ {
+ "epoch": 2.705069124423963,
+ "grad_norm": 0.25235581398010254,
+ "learning_rate": 4.018773677839289e-06,
+ "loss": 0.0031,
+ "step": 197
+ },
+ {
+ "epoch": 2.71889400921659,
+ "grad_norm": 0.3098089098930359,
+ "learning_rate": 3.999915253621739e-06,
+ "loss": 0.0019,
+ "step": 198
+ },
+ {
+ "epoch": 2.7327188940092166,
+ "grad_norm": 0.19896291196346283,
+ "learning_rate": 3.980922525988167e-06,
+ "loss": 0.0019,
+ "step": 199
+ },
+ {
+ "epoch": 2.7465437788018434,
+ "grad_norm": 0.21136268973350525,
+ "learning_rate": 3.961797195560118e-06,
+ "loss": 0.0031,
+ "step": 200
+ },
+ {
+ "epoch": 2.76036866359447,
+ "grad_norm": 0.2549005150794983,
+ "learning_rate": 3.942540974832486e-06,
+ "loss": 0.0017,
+ "step": 201
+ },
+ {
+ "epoch": 2.774193548387097,
+ "grad_norm": 0.14762410521507263,
+ "learning_rate": 3.9231555880201655e-06,
+ "loss": 0.0022,
+ "step": 202
+ },
+ {
+ "epoch": 2.7880184331797233,
+ "grad_norm": 0.16235944628715515,
+ "learning_rate": 3.903642770903671e-06,
+ "loss": 0.0012,
+ "step": 203
+ },
+ {
+ "epoch": 2.80184331797235,
+ "grad_norm": 0.1506718099117279,
+ "learning_rate": 3.884004270673711e-06,
+ "loss": 0.0015,
+ "step": 204
+ },
+ {
+ "epoch": 2.815668202764977,
+ "grad_norm": 0.10484135895967484,
+ "learning_rate": 3.864241845774746e-06,
+ "loss": 0.0016,
+ "step": 205
+ },
+ {
+ "epoch": 2.8294930875576036,
+ "grad_norm": 0.7636306285858154,
+ "learning_rate": 3.844357265747531e-06,
+ "loss": 0.0018,
+ "step": 206
+ },
+ {
+ "epoch": 2.8433179723502304,
+ "grad_norm": 0.2242082804441452,
+ "learning_rate": 3.8243523110706736e-06,
+ "loss": 0.0021,
+ "step": 207
+ },
+ {
+ "epoch": 2.857142857142857,
+ "grad_norm": 0.3264133334159851,
+ "learning_rate": 3.8042287730012117e-06,
+ "loss": 0.0021,
+ "step": 208
+ },
+ {
+ "epoch": 2.870967741935484,
+ "grad_norm": 0.12472204118967056,
+ "learning_rate": 3.7839884534142157e-06,
+ "loss": 0.0011,
+ "step": 209
+ },
+ {
+ "epoch": 2.8847926267281108,
+ "grad_norm": 0.07526414096355438,
+ "learning_rate": 3.7636331646414524e-06,
+ "loss": 0.0017,
+ "step": 210
+ },
+ {
+ "epoch": 2.8986175115207375,
+ "grad_norm": 0.16134843230247498,
+ "learning_rate": 3.7431647293091076e-06,
+ "loss": 0.0019,
+ "step": 211
+ },
+ {
+ "epoch": 2.912442396313364,
+ "grad_norm": 0.14789307117462158,
+ "learning_rate": 3.7225849801745835e-06,
+ "loss": 0.0012,
+ "step": 212
+ },
+ {
+ "epoch": 2.9262672811059907,
+ "grad_norm": 0.13681238889694214,
+ "learning_rate": 3.701895759962397e-06,
+ "loss": 0.0011,
+ "step": 213
+ },
+ {
+ "epoch": 2.9400921658986174,
+ "grad_norm": 0.10747735947370529,
+ "learning_rate": 3.6810989211991777e-06,
+ "loss": 0.0007,
+ "step": 214
+ },
+ {
+ "epoch": 2.953917050691244,
+ "grad_norm": 0.08121375739574432,
+ "learning_rate": 3.6601963260477923e-06,
+ "loss": 0.0005,
+ "step": 215
+ },
+ {
+ "epoch": 2.967741935483871,
+ "grad_norm": 0.0884300246834755,
+ "learning_rate": 3.6391898461406045e-06,
+ "loss": 0.0014,
+ "step": 216
+ },
+ {
+ "epoch": 2.9815668202764978,
+ "grad_norm": 0.18539245426654816,
+ "learning_rate": 3.6180813624118898e-06,
+ "loss": 0.002,
+ "step": 217
+ },
+ {
+ "epoch": 2.9953917050691246,
+ "grad_norm": 0.1257522702217102,
+ "learning_rate": 3.5968727649294134e-06,
+ "loss": 0.0015,
+ "step": 218
+ },
+ {
+ "epoch": 3.0,
+ "grad_norm": 1.2422555685043335,
+ "learning_rate": 3.575565952725193e-06,
+ "loss": 0.0002,
+ "step": 219
+ },
+ {
+ "epoch": 3.013824884792627,
+ "grad_norm": 0.06009506434202194,
+ "learning_rate": 3.55416283362546e-06,
+ "loss": 0.0003,
+ "step": 220
+ },
+ {
+ "epoch": 3.0276497695852536,
+ "grad_norm": 0.0876953974366188,
+ "learning_rate": 3.5326653240798283e-06,
+ "loss": 0.0005,
+ "step": 221
+ },
+ {
+ "epoch": 3.0414746543778803,
+ "grad_norm": 0.7512914538383484,
+ "learning_rate": 3.5110753489896924e-06,
+ "loss": 0.0007,
+ "step": 222
+ },
+ {
+ "epoch": 3.055299539170507,
+ "grad_norm": 0.08451899141073227,
+ "learning_rate": 3.4893948415358803e-06,
+ "loss": 0.0009,
+ "step": 223
+ },
+ {
+ "epoch": 3.0691244239631335,
+ "grad_norm": 0.15445305407047272,
+ "learning_rate": 3.4676257430055438e-06,
+ "loss": 0.0006,
+ "step": 224
+ },
+ {
+ "epoch": 3.0829493087557602,
+ "grad_norm": 0.07909094542264938,
+ "learning_rate": 3.4457700026183378e-06,
+ "loss": 0.0004,
+ "step": 225
+ },
+ {
+ "epoch": 3.096774193548387,
+ "grad_norm": 0.03637247905135155,
+ "learning_rate": 3.4238295773518924e-06,
+ "loss": 0.0003,
+ "step": 226
+ },
+ {
+ "epoch": 3.110599078341014,
+ "grad_norm": 0.203308567404747,
+ "learning_rate": 3.4018064317665745e-06,
+ "loss": 0.0003,
+ "step": 227
+ },
+ {
+ "epoch": 3.1244239631336406,
+ "grad_norm": 0.03239201754331589,
+ "learning_rate": 3.3797025378295826e-06,
+ "loss": 0.0002,
+ "step": 228
+ },
+ {
+ "epoch": 3.1382488479262673,
+ "grad_norm": 0.07106538861989975,
+ "learning_rate": 3.357519874738382e-06,
+ "loss": 0.0004,
+ "step": 229
+ },
+ {
+ "epoch": 3.152073732718894,
+ "grad_norm": 0.048268985003232956,
+ "learning_rate": 3.3352604287434752e-06,
+ "loss": 0.0003,
+ "step": 230
+ },
+ {
+ "epoch": 3.165898617511521,
+ "grad_norm": 0.0841558575630188,
+ "learning_rate": 3.31292619297056e-06,
+ "loss": 0.0003,
+ "step": 231
+ },
+ {
+ "epoch": 3.1797235023041477,
+ "grad_norm": 0.07029678672552109,
+ "learning_rate": 3.29051916724206e-06,
+ "loss": 0.0003,
+ "step": 232
+ },
+ {
+ "epoch": 3.193548387096774,
+ "grad_norm": 0.11369964480400085,
+ "learning_rate": 3.2680413578980623e-06,
+ "loss": 0.0014,
+ "step": 233
+ },
+ {
+ "epoch": 3.207373271889401,
+ "grad_norm": 0.0367964468896389,
+ "learning_rate": 3.245494777616664e-06,
+ "loss": 0.0001,
+ "step": 234
+ },
+ {
+ "epoch": 3.2211981566820276,
+ "grad_norm": 0.13746097683906555,
+ "learning_rate": 3.2228814452337587e-06,
+ "loss": 0.0003,
+ "step": 235
+ },
+ {
+ "epoch": 3.2350230414746544,
+ "grad_norm": 0.09046189486980438,
+ "learning_rate": 3.2002033855622683e-06,
+ "loss": 0.0004,
+ "step": 236
+ },
+ {
+ "epoch": 3.248847926267281,
+ "grad_norm": 0.04587667062878609,
+ "learning_rate": 3.177462629210838e-06,
+ "loss": 0.0002,
+ "step": 237
+ },
+ {
+ "epoch": 3.262672811059908,
+ "grad_norm": 0.11323168128728867,
+ "learning_rate": 3.154661212402017e-06,
+ "loss": 0.0003,
+ "step": 238
+ },
+ {
+ "epoch": 3.2764976958525347,
+ "grad_norm": 0.04728177189826965,
+ "learning_rate": 3.131801176789934e-06,
+ "loss": 0.0002,
+ "step": 239
+ },
+ {
+ "epoch": 3.2903225806451615,
+ "grad_norm": 0.527999997138977,
+ "learning_rate": 3.1088845692774798e-06,
+ "loss": 0.0008,
+ "step": 240
+ },
+ {
+ "epoch": 3.3041474654377883,
+ "grad_norm": 0.026646027341485023,
+ "learning_rate": 3.0859134418330373e-06,
+ "loss": 0.0001,
+ "step": 241
+ },
+ {
+ "epoch": 3.3179723502304146,
+ "grad_norm": 0.057450197637081146,
+ "learning_rate": 3.0628898513067357e-06,
+ "loss": 0.0004,
+ "step": 242
+ },
+ {
+ "epoch": 3.3317972350230414,
+ "grad_norm": 0.08258494734764099,
+ "learning_rate": 3.0398158592462847e-06,
+ "loss": 0.0005,
+ "step": 243
+ },
+ {
+ "epoch": 3.345622119815668,
+ "grad_norm": 0.01878846250474453,
+ "learning_rate": 3.0166935317123824e-06,
+ "loss": 0.0001,
+ "step": 244
+ },
+ {
+ "epoch": 3.359447004608295,
+ "grad_norm": 0.041918545961380005,
+ "learning_rate": 2.9935249390937184e-06,
+ "loss": 0.0002,
+ "step": 245
+ },
+ {
+ "epoch": 3.3732718894009217,
+ "grad_norm": 0.04018491134047508,
+ "learning_rate": 2.970312155921585e-06,
+ "loss": 0.0002,
+ "step": 246
+ },
+ {
+ "epoch": 3.3870967741935485,
+ "grad_norm": 0.040825020521879196,
+ "learning_rate": 2.9470572606841295e-06,
+ "loss": 0.0002,
+ "step": 247
+ },
+ {
+ "epoch": 3.4009216589861753,
+ "grad_norm": 0.050590481609106064,
+ "learning_rate": 2.9237623356402423e-06,
+ "loss": 0.0002,
+ "step": 248
+ },
+ {
+ "epoch": 3.4147465437788016,
+ "grad_norm": 0.07999978959560394,
+ "learning_rate": 2.900429466633107e-06,
+ "loss": 0.0002,
+ "step": 249
+ },
+ {
+ "epoch": 3.4285714285714284,
+ "grad_norm": 0.02137935161590576,
+ "learning_rate": 2.8770607429034352e-06,
+ "loss": 0.0001,
+ "step": 250
+ },
+ {
+ "epoch": 3.442396313364055,
+ "grad_norm": 0.18967340886592865,
+ "learning_rate": 2.8536582569023964e-06,
+ "loss": 0.0007,
+ "step": 251
+ },
+ {
+ "epoch": 3.456221198156682,
+ "grad_norm": 0.03681226819753647,
+ "learning_rate": 2.8302241041042564e-06,
+ "loss": 0.0001,
+ "step": 252
+ },
+ {
+ "epoch": 3.4700460829493087,
+ "grad_norm": 0.03142761439085007,
+ "learning_rate": 2.8067603828187446e-06,
+ "loss": 0.0001,
+ "step": 253
+ },
+ {
+ "epoch": 3.4838709677419355,
+ "grad_norm": 0.11318890005350113,
+ "learning_rate": 2.7832691940031755e-06,
+ "loss": 0.0005,
+ "step": 254
+ },
+ {
+ "epoch": 3.4976958525345623,
+ "grad_norm": 0.047176819294691086,
+ "learning_rate": 2.759752641074322e-06,
+ "loss": 0.0002,
+ "step": 255
+ },
+ {
+ "epoch": 3.511520737327189,
+ "grad_norm": 0.0642286092042923,
+ "learning_rate": 2.7362128297200784e-06,
+ "loss": 0.0002,
+ "step": 256
+ },
+ {
+ "epoch": 3.525345622119816,
+ "grad_norm": 0.09328105300664902,
+ "learning_rate": 2.712651867710914e-06,
+ "loss": 0.0004,
+ "step": 257
+ },
+ {
+ "epoch": 3.539170506912442,
+ "grad_norm": 0.08150269836187363,
+ "learning_rate": 2.6890718647111424e-06,
+ "loss": 0.0007,
+ "step": 258
+ },
+ {
+ "epoch": 3.5529953917050694,
+ "grad_norm": 0.03366294875741005,
+ "learning_rate": 2.665474932090017e-06,
+ "loss": 0.0001,
+ "step": 259
+ },
+ {
+ "epoch": 3.5668202764976957,
+ "grad_norm": 0.032316725701093674,
+ "learning_rate": 2.6418631827326857e-06,
+ "loss": 0.0001,
+ "step": 260
+ },
+ {
+ "epoch": 3.5806451612903225,
+ "grad_norm": 0.02776617370545864,
+ "learning_rate": 2.6182387308509927e-06,
+ "loss": 0.0001,
+ "step": 261
+ },
+ {
+ "epoch": 3.5944700460829493,
+ "grad_norm": 0.1258484572172165,
+ "learning_rate": 2.5946036917941765e-06,
+ "loss": 0.0003,
+ "step": 262
+ },
+ {
+ "epoch": 3.608294930875576,
+ "grad_norm": 0.04412033408880234,
+ "learning_rate": 2.570960181859458e-06,
+ "loss": 0.0003,
+ "step": 263
+ },
+ {
+ "epoch": 3.622119815668203,
+ "grad_norm": 0.016816483810544014,
+ "learning_rate": 2.547310318102548e-06,
+ "loss": 0.0001,
+ "step": 264
+ },
+ {
+ "epoch": 3.6359447004608296,
+ "grad_norm": 0.028503524139523506,
+ "learning_rate": 2.5236562181480794e-06,
+ "loss": 0.0001,
+ "step": 265
+ },
+ {
+ "epoch": 3.6497695852534564,
+ "grad_norm": 0.03991785645484924,
+ "learning_rate": 2.5e-06,
+ "loss": 0.0002,
+ "step": 266
+ },
+ {
+ "epoch": 3.6635944700460827,
+ "grad_norm": 0.07638856768608093,
+ "learning_rate": 2.4763437818519205e-06,
+ "loss": 0.0002,
+ "step": 267
+ },
+ {
+ "epoch": 3.6774193548387095,
+ "grad_norm": 0.032387226819992065,
+ "learning_rate": 2.4526896818974534e-06,
+ "loss": 0.0002,
+ "step": 268
+ },
+ {
+ "epoch": 3.6912442396313363,
+ "grad_norm": 0.035975128412246704,
+ "learning_rate": 2.429039818140543e-06,
+ "loss": 0.0002,
+ "step": 269
+ },
+ {
+ "epoch": 3.705069124423963,
+ "grad_norm": 0.021173926070332527,
+ "learning_rate": 2.405396308205825e-06,
+ "loss": 0.0001,
+ "step": 270
+ },
+ {
+ "epoch": 3.71889400921659,
+ "grad_norm": 0.005446314811706543,
+ "learning_rate": 2.381761269149009e-06,
+ "loss": 0.0,
+ "step": 271
+ },
+ {
+ "epoch": 3.7327188940092166,
+ "grad_norm": 0.04019308090209961,
+ "learning_rate": 2.358136817267315e-06,
+ "loss": 0.0001,
+ "step": 272
+ },
+ {
+ "epoch": 3.7465437788018434,
+ "grad_norm": 0.0222685057669878,
+ "learning_rate": 2.334525067909983e-06,
+ "loss": 0.0001,
+ "step": 273
+ },
+ {
+ "epoch": 3.76036866359447,
+ "grad_norm": 0.02486710622906685,
+ "learning_rate": 2.3109281352888593e-06,
+ "loss": 0.0001,
+ "step": 274
+ },
+ {
+ "epoch": 3.774193548387097,
+ "grad_norm": 0.01929207146167755,
+ "learning_rate": 2.2873481322890866e-06,
+ "loss": 0.0001,
+ "step": 275
+ },
+ {
+ "epoch": 3.7880184331797233,
+ "grad_norm": 0.010686581023037434,
+ "learning_rate": 2.263787170279922e-06,
+ "loss": 0.0,
+ "step": 276
+ },
+ {
+ "epoch": 3.80184331797235,
+ "grad_norm": 0.04710806906223297,
+ "learning_rate": 2.2402473589256793e-06,
+ "loss": 0.0001,
+ "step": 277
+ },
+ {
+ "epoch": 3.815668202764977,
+ "grad_norm": 0.00774085009470582,
+ "learning_rate": 2.2167308059968258e-06,
+ "loss": 0.0,
+ "step": 278
+ },
+ {
+ "epoch": 3.8294930875576036,
+ "grad_norm": 0.00735470512881875,
+ "learning_rate": 2.193239617181256e-06,
+ "loss": 0.0,
+ "step": 279
+ },
+ {
+ "epoch": 3.8433179723502304,
+ "grad_norm": 0.005572167690843344,
+ "learning_rate": 2.169775895895745e-06,
+ "loss": 0.0,
+ "step": 280
+ },
+ {
+ "epoch": 3.857142857142857,
+ "grad_norm": 0.07026448100805283,
+ "learning_rate": 2.146341743097604e-06,
+ "loss": 0.0004,
+ "step": 281
+ },
+ {
+ "epoch": 3.870967741935484,
+ "grad_norm": 0.03968067839741707,
+ "learning_rate": 2.1229392570965656e-06,
+ "loss": 0.0001,
+ "step": 282
+ },
+ {
+ "epoch": 3.8847926267281108,
+ "grad_norm": 0.002730958629399538,
+ "learning_rate": 2.0995705333668948e-06,
+ "loss": 0.0,
+ "step": 283
+ },
+ {
+ "epoch": 3.8986175115207375,
+ "grad_norm": 0.010703709907829762,
+ "learning_rate": 2.0762376643597586e-06,
+ "loss": 0.0,
+ "step": 284
+ },
+ {
+ "epoch": 3.912442396313364,
+ "grad_norm": 0.03527766093611717,
+ "learning_rate": 2.0529427393158704e-06,
+ "loss": 0.0001,
+ "step": 285
+ },
+ {
+ "epoch": 3.9262672811059907,
+ "grad_norm": 0.03926033526659012,
+ "learning_rate": 2.0296878440784164e-06,
+ "loss": 0.0001,
+ "step": 286
+ },
+ {
+ "epoch": 3.9400921658986174,
+ "grad_norm": 0.007335775997489691,
+ "learning_rate": 2.006475060906283e-06,
+ "loss": 0.0,
+ "step": 287
+ },
+ {
+ "epoch": 3.953917050691244,
+ "grad_norm": 0.005718631204217672,
+ "learning_rate": 1.9833064682876175e-06,
+ "loss": 0.0,
+ "step": 288
+ },
+ {
+ "epoch": 3.967741935483871,
+ "grad_norm": 0.005941327195614576,
+ "learning_rate": 1.9601841407537157e-06,
+ "loss": 0.0,
+ "step": 289
+ },
+ {
+ "epoch": 3.9815668202764978,
+ "grad_norm": 0.039281055331230164,
+ "learning_rate": 1.937110148693265e-06,
+ "loss": 0.0001,
+ "step": 290
+ },
+ {
+ "epoch": 3.9953917050691246,
+ "grad_norm": 0.06976872682571411,
+ "learning_rate": 1.9140865581669627e-06,
+ "loss": 0.0001,
+ "step": 291
+ },
+ {
+ "epoch": 4.0,
+ "grad_norm": 0.06976872682571411,
+ "learning_rate": 1.8911154307225204e-06,
+ "loss": 0.0,
+ "step": 292
+ },
+ {
+ "epoch": 4.013824884792626,
+ "grad_norm": 0.005908307619392872,
+ "learning_rate": 1.8681988232100674e-06,
+ "loss": 0.0,
+ "step": 293
+ },
+ {
+ "epoch": 4.027649769585254,
+ "grad_norm": 0.022486088797450066,
+ "learning_rate": 1.8453387875979834e-06,
+ "loss": 0.0001,
+ "step": 294
+ },
+ {
+ "epoch": 4.04147465437788,
+ "grad_norm": 0.0074249873869121075,
+ "learning_rate": 1.822537370789163e-06,
+ "loss": 0.0,
+ "step": 295
+ },
+ {
+ "epoch": 4.055299539170507,
+ "grad_norm": 0.004768090322613716,
+ "learning_rate": 1.7997966144377328e-06,
+ "loss": 0.0,
+ "step": 296
+ },
+ {
+ "epoch": 4.0691244239631335,
+ "grad_norm": 0.013053408823907375,
+ "learning_rate": 1.7771185547662417e-06,
+ "loss": 0.0,
+ "step": 297
+ },
+ {
+ "epoch": 4.082949308755761,
+ "grad_norm": 0.00568437110632658,
+ "learning_rate": 1.754505222383337e-06,
+ "loss": 0.0,
+ "step": 298
+ },
+ {
+ "epoch": 4.096774193548387,
+ "grad_norm": 0.006704168394207954,
+ "learning_rate": 1.7319586421019383e-06,
+ "loss": 0.0,
+ "step": 299
+ },
+ {
+ "epoch": 4.110599078341014,
+ "grad_norm": 0.0039120810106396675,
+ "learning_rate": 1.7094808327579401e-06,
+ "loss": 0.0,
+ "step": 300
+ },
+ {
+ "epoch": 4.124423963133641,
+ "grad_norm": 0.009206798858940601,
+ "learning_rate": 1.6870738070294412e-06,
+ "loss": 0.0,
+ "step": 301
+ },
+ {
+ "epoch": 4.138248847926267,
+ "grad_norm": 0.005304583813995123,
+ "learning_rate": 1.6647395712565256e-06,
+ "loss": 0.0,
+ "step": 302
+ },
+ {
+ "epoch": 4.152073732718894,
+ "grad_norm": 0.008103611879050732,
+ "learning_rate": 1.6424801252616186e-06,
+ "loss": 0.0001,
+ "step": 303
+ },
+ {
+ "epoch": 4.1658986175115205,
+ "grad_norm": 0.028891608119010925,
+ "learning_rate": 1.6202974621704176e-06,
+ "loss": 0.0,
+ "step": 304
+ },
+ {
+ "epoch": 4.179723502304148,
+ "grad_norm": 0.0035763406194746494,
+ "learning_rate": 1.5981935682334266e-06,
+ "loss": 0.0,
+ "step": 305
+ },
+ {
+ "epoch": 4.193548387096774,
+ "grad_norm": 0.009718772955238819,
+ "learning_rate": 1.5761704226481078e-06,
+ "loss": 0.0,
+ "step": 306
+ },
+ {
+ "epoch": 4.207373271889401,
+ "grad_norm": 0.01045698020607233,
+ "learning_rate": 1.5542299973816626e-06,
+ "loss": 0.0,
+ "step": 307
+ },
+ {
+ "epoch": 4.221198156682028,
+ "grad_norm": 0.004575685132294893,
+ "learning_rate": 1.5323742569944573e-06,
+ "loss": 0.0,
+ "step": 308
+ },
+ {
+ "epoch": 4.235023041474655,
+ "grad_norm": 0.003245371161028743,
+ "learning_rate": 1.5106051584641208e-06,
+ "loss": 0.0,
+ "step": 309
+ },
+ {
+ "epoch": 4.248847926267281,
+ "grad_norm": 0.005619620904326439,
+ "learning_rate": 1.4889246510103078e-06,
+ "loss": 0.0,
+ "step": 310
+ },
+ {
+ "epoch": 4.2626728110599075,
+ "grad_norm": 0.004715710878372192,
+ "learning_rate": 1.4673346759201728e-06,
+ "loss": 0.0,
+ "step": 311
+ },
+ {
+ "epoch": 4.276497695852535,
+ "grad_norm": 0.007476332131773233,
+ "learning_rate": 1.44583716637454e-06,
+ "loss": 0.0,
+ "step": 312
+ },
+ {
+ "epoch": 4.290322580645161,
+ "grad_norm": 0.01739400625228882,
+ "learning_rate": 1.4244340472748076e-06,
+ "loss": 0.0001,
+ "step": 313
+ },
+ {
+ "epoch": 4.304147465437788,
+ "grad_norm": 0.00816753227263689,
+ "learning_rate": 1.403127235070587e-06,
+ "loss": 0.0,
+ "step": 314
+ },
+ {
+ "epoch": 4.317972350230415,
+ "grad_norm": 0.010216044262051582,
+ "learning_rate": 1.381918637588112e-06,
+ "loss": 0.0,
+ "step": 315
+ },
+ {
+ "epoch": 4.331797235023042,
+ "grad_norm": 0.004990486893802881,
+ "learning_rate": 1.3608101538593965e-06,
+ "loss": 0.0,
+ "step": 316
+ },
+ {
+ "epoch": 4.345622119815668,
+ "grad_norm": 0.004758649505674839,
+ "learning_rate": 1.3398036739522088e-06,
+ "loss": 0.0001,
+ "step": 317
+ },
+ {
+ "epoch": 4.359447004608295,
+ "grad_norm": 0.041808340698480606,
+ "learning_rate": 1.3189010788008234e-06,
+ "loss": 0.0,
+ "step": 318
+ },
+ {
+ "epoch": 4.373271889400922,
+ "grad_norm": 0.012711254879832268,
+ "learning_rate": 1.2981042400376032e-06,
+ "loss": 0.0,
+ "step": 319
+ },
+ {
+ "epoch": 4.387096774193548,
+ "grad_norm": 0.0035697701387107372,
+ "learning_rate": 1.277415019825417e-06,
+ "loss": 0.0,
+ "step": 320
+ },
+ {
+ "epoch": 4.400921658986175,
+ "grad_norm": 0.005487007088959217,
+ "learning_rate": 1.2568352706908937e-06,
+ "loss": 0.0,
+ "step": 321
+ },
+ {
+ "epoch": 4.414746543778802,
+ "grad_norm": 0.01304635126143694,
+ "learning_rate": 1.2363668353585486e-06,
+ "loss": 0.0,
+ "step": 322
+ },
+ {
+ "epoch": 4.428571428571429,
+ "grad_norm": 0.0019787494093179703,
+ "learning_rate": 1.216011546585785e-06,
+ "loss": 0.0,
+ "step": 323
+ },
+ {
+ "epoch": 4.442396313364055,
+ "grad_norm": 0.00808583851903677,
+ "learning_rate": 1.195771226998789e-06,
+ "loss": 0.0,
+ "step": 324
+ },
+ {
+ "epoch": 4.456221198156682,
+ "grad_norm": 0.0022094689775258303,
+ "learning_rate": 1.1756476889293269e-06,
+ "loss": 0.0,
+ "step": 325
+ },
+ {
+ "epoch": 4.470046082949309,
+ "grad_norm": 0.012792594730854034,
+ "learning_rate": 1.1556427342524698e-06,
+ "loss": 0.0,
+ "step": 326
+ },
+ {
+ "epoch": 4.483870967741936,
+ "grad_norm": 0.006805351935327053,
+ "learning_rate": 1.1357581542252555e-06,
+ "loss": 0.0,
+ "step": 327
+ },
+ {
+ "epoch": 4.497695852534562,
+ "grad_norm": 0.003740285988897085,
+ "learning_rate": 1.1159957293262888e-06,
+ "loss": 0.0,
+ "step": 328
+ },
+ {
+ "epoch": 4.511520737327189,
+ "grad_norm": 0.009705561213195324,
+ "learning_rate": 1.0963572290963298e-06,
+ "loss": 0.0,
+ "step": 329
+ },
+ {
+ "epoch": 4.525345622119816,
+ "grad_norm": 0.040002401918172836,
+ "learning_rate": 1.0768444119798357e-06,
+ "loss": 0.0002,
+ "step": 330
+ },
+ {
+ "epoch": 4.539170506912442,
+ "grad_norm": 0.0036789420992136,
+ "learning_rate": 1.0574590251675145e-06,
+ "loss": 0.0,
+ "step": 331
+ },
+ {
+ "epoch": 4.552995391705069,
+ "grad_norm": 0.004043960478156805,
+ "learning_rate": 1.0382028044398823e-06,
+ "loss": 0.0002,
+ "step": 332
+ },
+ {
+ "epoch": 4.566820276497696,
+ "grad_norm": 0.0512581467628479,
+ "learning_rate": 1.0190774740118343e-06,
+ "loss": 0.0,
+ "step": 333
+ },
+ {
+ "epoch": 4.580645161290323,
+ "grad_norm": 0.004926969762891531,
+ "learning_rate": 1.0000847463782615e-06,
+ "loss": 0.0,
+ "step": 334
+ },
+ {
+ "epoch": 4.594470046082949,
+ "grad_norm": 0.0043294900096952915,
+ "learning_rate": 9.812263221607114e-07,
+ "loss": 0.0,
+ "step": 335
+ },
+ {
+ "epoch": 4.6082949308755765,
+ "grad_norm": 0.0023195091634988785,
+ "learning_rate": 9.625038899551162e-07,
+ "loss": 0.0,
+ "step": 336
+ },
+ {
+ "epoch": 4.622119815668203,
+ "grad_norm": 0.0015059575671330094,
+ "learning_rate": 9.439191261805894e-07,
+ "loss": 0.0,
+ "step": 337
+ },
+ {
+ "epoch": 4.635944700460829,
+ "grad_norm": 0.001368862227536738,
+ "learning_rate": 9.254736949293216e-07,
+ "loss": 0.0,
+ "step": 338
+ },
+ {
+ "epoch": 4.649769585253456,
+ "grad_norm": 0.008128674700856209,
+ "learning_rate": 9.07169247817579e-07,
+ "loss": 0.0,
+ "step": 339
+ },
+ {
+ "epoch": 4.663594470046083,
+ "grad_norm": 0.0029226879123598337,
+ "learning_rate": 8.890074238378074e-07,
+ "loss": 0.0,
+ "step": 340
+ },
+ {
+ "epoch": 4.67741935483871,
+ "grad_norm": 0.0012331035686656833,
+ "learning_rate": 8.709898492118885e-07,
+ "loss": 0.0,
+ "step": 341
+ },
+ {
+ "epoch": 4.691244239631336,
+ "grad_norm": 0.005286338273435831,
+ "learning_rate": 8.531181372455161e-07,
+ "loss": 0.0,
+ "step": 342
+ },
+ {
+ "epoch": 4.705069124423963,
+ "grad_norm": 0.0026836844626814127,
+ "learning_rate": 8.353938881837445e-07,
+ "loss": 0.0,
+ "step": 343
+ },
+ {
+ "epoch": 4.71889400921659,
+ "grad_norm": 0.013100259937345982,
+ "learning_rate": 8.178186890677029e-07,
+ "loss": 0.0,
+ "step": 344
+ },
+ {
+ "epoch": 4.732718894009217,
+ "grad_norm": 0.005650435108691454,
+ "learning_rate": 8.003941135924859e-07,
+ "loss": 0.0,
+ "step": 345
+ },
+ {
+ "epoch": 4.746543778801843,
+ "grad_norm": 0.007480297237634659,
+ "learning_rate": 7.83121721966248e-07,
+ "loss": 0.0,
+ "step": 346
+ },
+ {
+ "epoch": 4.76036866359447,
+ "grad_norm": 0.014115474186837673,
+ "learning_rate": 7.66003060770498e-07,
+ "loss": 0.0,
+ "step": 347
+ },
+ {
+ "epoch": 4.774193548387097,
+ "grad_norm": 0.0011564996093511581,
+ "learning_rate": 7.490396628216237e-07,
+ "loss": 0.0,
+ "step": 348
+ },
+ {
+ "epoch": 4.788018433179723,
+ "grad_norm": 0.01101834885776043,
+ "learning_rate": 7.322330470336314e-07,
+ "loss": 0.0,
+ "step": 349
+ },
+ {
+ "epoch": 4.8018433179723505,
+ "grad_norm": 0.003535608062520623,
+ "learning_rate": 7.155847182821524e-07,
+ "loss": 0.0,
+ "step": 350
+ },
+ {
+ "epoch": 4.815668202764977,
+ "grad_norm": 0.03185940906405449,
+ "learning_rate": 6.990961672696908e-07,
+ "loss": 0.0001,
+ "step": 351
+ },
+ {
+ "epoch": 4.829493087557603,
+ "grad_norm": 0.006516721565276384,
+ "learning_rate": 6.827688703921407e-07,
+ "loss": 0.0,
+ "step": 352
+ },
+ {
+ "epoch": 4.84331797235023,
+ "grad_norm": 0.008277276530861855,
+ "learning_rate": 6.666042896065983e-07,
+ "loss": 0.0,
+ "step": 353
+ },
+ {
+ "epoch": 4.857142857142857,
+ "grad_norm": 0.00266360049135983,
+ "learning_rate": 6.506038723004484e-07,
+ "loss": 0.0,
+ "step": 354
+ },
+ {
+ "epoch": 4.870967741935484,
+ "grad_norm": 0.01671386882662773,
+ "learning_rate": 6.347690511617693e-07,
+ "loss": 0.0,
+ "step": 355
+ },
+ {
+ "epoch": 4.88479262672811,
+ "grad_norm": 0.013981528580188751,
+ "learning_rate": 6.191012440510469e-07,
+ "loss": 0.0001,
+ "step": 356
+ },
+ {
+ "epoch": 4.8986175115207375,
+ "grad_norm": 0.02350999414920807,
+ "learning_rate": 6.036018538742208e-07,
+ "loss": 0.0,
+ "step": 357
+ },
+ {
+ "epoch": 4.912442396313364,
+ "grad_norm": 0.004093356896191835,
+ "learning_rate": 5.882722684570638e-07,
+ "loss": 0.0,
+ "step": 358
+ },
+ {
+ "epoch": 4.926267281105991,
+ "grad_norm": 0.00656296918168664,
+ "learning_rate": 5.731138604209169e-07,
+ "loss": 0.0,
+ "step": 359
+ },
+ {
+ "epoch": 4.940092165898617,
+ "grad_norm": 0.002148544415831566,
+ "learning_rate": 5.581279870597866e-07,
+ "loss": 0.0,
+ "step": 360
+ },
+ {
+ "epoch": 4.953917050691244,
+ "grad_norm": 0.001741332234814763,
+ "learning_rate": 5.433159902188043e-07,
+ "loss": 0.0,
+ "step": 361
+ },
+ {
+ "epoch": 4.967741935483871,
+ "grad_norm": 0.0027741482481360435,
+ "learning_rate": 5.286791961740855e-07,
+ "loss": 0.0,
+ "step": 362
+ },
+ {
+ "epoch": 4.981566820276497,
+ "grad_norm": 0.016398902982473373,
+ "learning_rate": 5.142189155139685e-07,
+ "loss": 0.0,
+ "step": 363
+ },
+ {
+ "epoch": 4.9953917050691246,
+ "grad_norm": 0.0017799193738028407,
+ "learning_rate": 4.999364430216639e-07,
+ "loss": 0.0,
+ "step": 364
+ },
+ {
+ "epoch": 5.0,
+ "grad_norm": 0.0017799193738028407,
+ "learning_rate": 4.85833057559322e-07,
+ "loss": 0.0,
+ "step": 365
+ },
+ {
+ "epoch": 5.013824884792626,
+ "grad_norm": 0.020532581955194473,
+ "learning_rate": 4.719100219535194e-07,
+ "loss": 0.0,
+ "step": 366
+ },
+ {
+ "epoch": 5.027649769585254,
+ "grad_norm": 0.002060740254819393,
+ "learning_rate": 4.581685828821858e-07,
+ "loss": 0.0,
+ "step": 367
+ },
+ {
+ "epoch": 5.04147465437788,
+ "grad_norm": 0.005803892854601145,
+ "learning_rate": 4.4460997076297504e-07,
+ "loss": 0.0001,
+ "step": 368
+ },
+ {
+ "epoch": 5.055299539170507,
+ "grad_norm": 0.02043106034398079,
+ "learning_rate": 4.3123539964309486e-07,
+ "loss": 0.0,
+ "step": 369
+ },
+ {
+ "epoch": 5.0691244239631335,
+ "grad_norm": 0.002218640176579356,
+ "learning_rate": 4.180460670905978e-07,
+ "loss": 0.0,
+ "step": 370
+ },
+ {
+ "epoch": 5.082949308755761,
+ "grad_norm": 0.002037397352978587,
+ "learning_rate": 4.0504315408714993e-07,
+ "loss": 0.0,
+ "step": 371
+ },
+ {
+ "epoch": 5.096774193548387,
+ "grad_norm": 0.0042083426378667355,
+ "learning_rate": 3.922278249222894e-07,
+ "loss": 0.0,
+ "step": 372
+ },
+ {
+ "epoch": 5.110599078341014,
+ "grad_norm": 0.0014362820656970143,
+ "learning_rate": 3.796012270891672e-07,
+ "loss": 0.0001,
+ "step": 373
+ },
+ {
+ "epoch": 5.124423963133641,
+ "grad_norm": 0.0357951857149601,
+ "learning_rate": 3.671644911818084e-07,
+ "loss": 0.0,
+ "step": 374
+ },
+ {
+ "epoch": 5.138248847926267,
+ "grad_norm": 0.0089655676856637,
+ "learning_rate": 3.549187307938726e-07,
+ "loss": 0.0,
+ "step": 375
+ },
+ {
+ "epoch": 5.152073732718894,
+ "grad_norm": 0.0018362803384661674,
+ "learning_rate": 3.4286504241894283e-07,
+ "loss": 0.0,
+ "step": 376
+ },
+ {
+ "epoch": 5.1658986175115205,
+ "grad_norm": 0.00128362572286278,
+ "learning_rate": 3.310045053523475e-07,
+ "loss": 0.0,
+ "step": 377
+ },
+ {
+ "epoch": 5.179723502304148,
+ "grad_norm": 0.0027053402736783028,
+ "learning_rate": 3.1933818159451566e-07,
+ "loss": 0.0,
+ "step": 378
+ },
+ {
+ "epoch": 5.193548387096774,
+ "grad_norm": 0.0018623315263539553,
+ "learning_rate": 3.078671157558877e-07,
+ "loss": 0.0,
+ "step": 379
+ },
+ {
+ "epoch": 5.207373271889401,
+ "grad_norm": 0.0021778331138193607,
+ "learning_rate": 2.965923349633779e-07,
+ "loss": 0.0,
+ "step": 380
+ },
+ {
+ "epoch": 5.221198156682028,
+ "grad_norm": 0.003248832654207945,
+ "learning_rate": 2.8551484876840815e-07,
+ "loss": 0.0,
+ "step": 381
+ },
+ {
+ "epoch": 5.235023041474655,
+ "grad_norm": 0.0036160217132419348,
+ "learning_rate": 2.7463564905650855e-07,
+ "loss": 0.0,
+ "step": 382
+ },
+ {
+ "epoch": 5.248847926267281,
+ "grad_norm": 0.002592155709862709,
+ "learning_rate": 2.639557099585047e-07,
+ "loss": 0.0,
+ "step": 383
+ },
+ {
+ "epoch": 5.2626728110599075,
+ "grad_norm": 0.002448409330099821,
+ "learning_rate": 2.53475987763295e-07,
+ "loss": 0.0,
+ "step": 384
+ },
+ {
+ "epoch": 5.276497695852535,
+ "grad_norm": 0.0013525751419365406,
+ "learning_rate": 2.431974208322191e-07,
+ "loss": 0.0,
+ "step": 385
+ },
+ {
+ "epoch": 5.290322580645161,
+ "grad_norm": 0.0022237959783524275,
+ "learning_rate": 2.3312092951504357e-07,
+ "loss": 0.0,
+ "step": 386
+ },
+ {
+ "epoch": 5.304147465437788,
+ "grad_norm": 0.0074063981883227825,
+ "learning_rate": 2.2324741606754629e-07,
+ "loss": 0.0,
+ "step": 387
+ },
+ {
+ "epoch": 5.317972350230415,
+ "grad_norm": 0.0030363211408257484,
+ "learning_rate": 2.135777645707318e-07,
+ "loss": 0.0,
+ "step": 388
+ },
+ {
+ "epoch": 5.331797235023042,
+ "grad_norm": 0.0018678180640563369,
+ "learning_rate": 2.041128408516696e-07,
+ "loss": 0.0,
+ "step": 389
+ },
+ {
+ "epoch": 5.345622119815668,
+ "grad_norm": 0.008805639110505581,
+ "learning_rate": 1.9485349240596613e-07,
+ "loss": 0.0,
+ "step": 390
+ },
+ {
+ "epoch": 5.359447004608295,
+ "grad_norm": 0.0022748217452317476,
+ "learning_rate": 1.8580054832188055e-07,
+ "loss": 0.0,
+ "step": 391
+ },
+ {
+ "epoch": 5.373271889400922,
+ "grad_norm": 0.004513687454164028,
+ "learning_rate": 1.7695481920608716e-07,
+ "loss": 0.0,
+ "step": 392
+ },
+ {
+ "epoch": 5.387096774193548,
+ "grad_norm": 0.004760707262903452,
+ "learning_rate": 1.683170971110934e-07,
+ "loss": 0.0,
+ "step": 393
+ },
+ {
+ "epoch": 5.400921658986175,
+ "grad_norm": 0.0018107573268935084,
+ "learning_rate": 1.5988815546431807e-07,
+ "loss": 0.0,
+ "step": 394
+ },
+ {
+ "epoch": 5.414746543778802,
+ "grad_norm": 0.0031882438343018293,
+ "learning_rate": 1.5166874899884054e-07,
+ "loss": 0.0,
+ "step": 395
+ },
+ {
+ "epoch": 5.428571428571429,
+ "grad_norm": 0.0029730822425335646,
+ "learning_rate": 1.4365961368581844e-07,
+ "loss": 0.0,
+ "step": 396
+ },
+ {
+ "epoch": 5.442396313364055,
+ "grad_norm": 0.01603095605969429,
+ "learning_rate": 1.3586146666858923e-07,
+ "loss": 0.0,
+ "step": 397
+ },
+ {
+ "epoch": 5.456221198156682,
+ "grad_norm": 0.0031079358886927366,
+ "learning_rate": 1.2827500619845918e-07,
+ "loss": 0.0,
+ "step": 398
+ },
+ {
+ "epoch": 5.470046082949309,
+ "grad_norm": 0.005407292395830154,
+ "learning_rate": 1.2090091157217653e-07,
+ "loss": 0.0,
+ "step": 399
+ },
+ {
+ "epoch": 5.483870967741936,
+ "grad_norm": 0.004690848756581545,
+ "learning_rate": 1.137398430711123e-07,
+ "loss": 0.0,
+ "step": 400
+ },
+ {
+ "epoch": 5.497695852534562,
+ "grad_norm": 0.008627370931208134,
+ "learning_rate": 1.0679244190213378e-07,
+ "loss": 0.0,
+ "step": 401
+ },
+ {
+ "epoch": 5.511520737327189,
+ "grad_norm": 0.0011705803917720914,
+ "learning_rate": 1.0005933014019309e-07,
+ "loss": 0.0,
+ "step": 402
+ },
+ {
+ "epoch": 5.525345622119816,
+ "grad_norm": 0.0012002491857856512,
+ "learning_rate": 9.354111067262584e-08,
+ "loss": 0.0,
+ "step": 403
+ },
+ {
+ "epoch": 5.539170506912442,
+ "grad_norm": 0.003351973369717598,
+ "learning_rate": 8.723836714516681e-08,
+ "loss": 0.0,
+ "step": 404
+ },
+ {
+ "epoch": 5.552995391705069,
+ "grad_norm": 0.0029097534716129303,
+ "learning_rate": 8.115166390969126e-08,
+ "loss": 0.0,
+ "step": 405
+ },
+ {
+ "epoch": 5.566820276497696,
+ "grad_norm": 0.006721805315464735,
+ "learning_rate": 7.528154597368192e-08,
+ "loss": 0.0,
+ "step": 406
+ },
+ {
+ "epoch": 5.580645161290323,
+ "grad_norm": 0.002376762218773365,
+ "learning_rate": 6.962853895142924e-08,
+ "loss": 0.0,
+ "step": 407
+ },
+ {
+ "epoch": 5.594470046082949,
+ "grad_norm": 0.0010407604277133942,
+ "learning_rate": 6.419314901696671e-08,
+ "loss": 0.0,
+ "step": 408
+ },
+ {
+ "epoch": 5.6082949308755765,
+ "grad_norm": 0.004894618410617113,
+ "learning_rate": 5.897586285874751e-08,
+ "loss": 0.0,
+ "step": 409
+ },
+ {
+ "epoch": 5.622119815668203,
+ "grad_norm": 0.003351738443598151,
+ "learning_rate": 5.3977147636068425e-08,
+ "loss": 0.0,
+ "step": 410
+ },
+ {
+ "epoch": 5.635944700460829,
+ "grad_norm": 0.008450047113001347,
+ "learning_rate": 4.919745093723588e-08,
+ "loss": 0.0,
+ "step": 411
+ },
+ {
+ "epoch": 5.649769585253456,
+ "grad_norm": 0.005739122163504362,
+ "learning_rate": 4.4637200739493514e-08,
+ "loss": 0.0,
+ "step": 412
+ },
+ {
+ "epoch": 5.663594470046083,
+ "grad_norm": 0.008892405778169632,
+ "learning_rate": 4.0296805370696466e-08,
+ "loss": 0.0,
+ "step": 413
+ },
+ {
+ "epoch": 5.67741935483871,
+ "grad_norm": 0.004593739286065102,
+ "learning_rate": 3.617665347275201e-08,
+ "loss": 0.0,
+ "step": 414
+ },
+ {
+ "epoch": 5.691244239631336,
+ "grad_norm": 0.0009952496038749814,
+ "learning_rate": 3.227711396682015e-08,
+ "loss": 0.0,
+ "step": 415
+ },
+ {
+ "epoch": 5.705069124423963,
+ "grad_norm": 0.021882547065615654,
+ "learning_rate": 2.8598536020278678e-08,
+ "loss": 0.0001,
+ "step": 416
+ },
+ {
+ "epoch": 5.71889400921659,
+ "grad_norm": 0.0015494381077587605,
+ "learning_rate": 2.5141249015459833e-08,
+ "loss": 0.0,
+ "step": 417
+ },
+ {
+ "epoch": 5.732718894009217,
+ "grad_norm": 0.0045528411865234375,
+ "learning_rate": 2.1905562520156686e-08,
+ "loss": 0.0,
+ "step": 418
+ },
+ {
+ "epoch": 5.746543778801843,
+ "grad_norm": 0.0061921789310872555,
+ "learning_rate": 1.8891766259904188e-08,
+ "loss": 0.0,
+ "step": 419
+ },
+ {
+ "epoch": 5.76036866359447,
+ "grad_norm": 0.0012759686214849353,
+ "learning_rate": 1.6100130092037704e-08,
+ "loss": 0.0,
+ "step": 420
+ },
+ {
+ "epoch": 5.774193548387097,
+ "grad_norm": 0.002300928346812725,
+ "learning_rate": 1.3530903981528454e-08,
+ "loss": 0.0,
+ "step": 421
+ },
+ {
+ "epoch": 5.788018433179723,
+ "grad_norm": 0.00588129460811615,
+ "learning_rate": 1.118431797860281e-08,
+ "loss": 0.0,
+ "step": 422
+ },
+ {
+ "epoch": 5.8018433179723505,
+ "grad_norm": 0.002937384182587266,
+ "learning_rate": 9.060582198141822e-09,
+ "loss": 0.0,
+ "step": 423
+ },
+ {
+ "epoch": 5.815668202764977,
+ "grad_norm": 0.0028133601881563663,
+ "learning_rate": 7.159886800869875e-09,
+ "loss": 0.0,
+ "step": 424
+ },
+ {
+ "epoch": 5.829493087557603,
+ "grad_norm": 0.0039275335147976875,
+ "learning_rate": 5.482401976325269e-09,
+ "loss": 0.0,
+ "step": 425
+ },
+ {
+ "epoch": 5.84331797235023,
+ "grad_norm": 0.0026619117707014084,
+ "learning_rate": 4.028277927621838e-09,
+ "loss": 0.0,
+ "step": 426
+ },
+ {
+ "epoch": 5.857142857142857,
+ "grad_norm": 0.0010664263973012567,
+ "learning_rate": 2.7976448580008252e-09,
+ "loss": 0.0,
+ "step": 427
+ },
+ {
+ "epoch": 5.870967741935484,
+ "grad_norm": 0.0023162406869232655,
+ "learning_rate": 1.7906129591713228e-09,
+ "loss": 0.0,
+ "step": 428
+ },
+ {
+ "epoch": 5.88479262672811,
+ "grad_norm": 0.001858705305494368,
+ "learning_rate": 1.0072724014437153e-09,
+ "loss": 0.0,
+ "step": 429
+ },
+ {
+ "epoch": 5.8986175115207375,
+ "grad_norm": 0.005602267105132341,
+ "learning_rate": 4.476933256555849e-10,
+ "loss": 0.0,
+ "step": 430
+ },
+ {
+ "epoch": 5.912442396313364,
+ "grad_norm": 0.005617993883788586,
+ "learning_rate": 1.1192583689256797e-10,
+ "loss": 0.0,
+ "step": 431
+ },
+ {
+ "epoch": 5.926267281105991,
+ "grad_norm": 0.002151880646124482,
+ "learning_rate": 0.0,
+ "loss": 0.0,
+ "step": 432
+ }
+ ],
+ "logging_steps": 1,
+ "max_steps": 432,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 6,
+ "save_steps": 72,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": true
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 2.7576912270814347e+19,
+ "train_batch_size": 4,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/checkpoint-432/training_args.bin b/checkpoint-432/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..48c70fd554062e31c1333fa196fcbd6a4f178c6c
--- /dev/null
+++ b/checkpoint-432/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d1d334ff15891d240486e54641859af8de96cab43a64ef7bf9dc417387365ae5
+size 7928
diff --git a/checkpoint-432/zero_to_fp32.py b/checkpoint-432/zero_to_fp32.py
new file mode 100644
index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8
--- /dev/null
+++ b/checkpoint-432/zero_to_fp32.py
@@ -0,0 +1,604 @@
+#!/usr/bin/env python
+
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example: python zero_to_fp32.py . pytorch_model.bin
+
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+from collections import OrderedDict
+from dataclasses import dataclass
+
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+ FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+ FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+
+
+@dataclass
+class zero_model_state:
+ buffers: dict()
+ param_shapes: dict()
+ shared_params: list
+ ds_version: int
+ frozen_param_shapes: dict()
+ frozen_param_fragments: dict()
+
+
+debug = 0
+
+# load to cpu
+device = torch.device('cpu')
+
+
+def atoi(text):
+ return int(text) if text.isdigit() else text
+
+
+def natural_keys(text):
+ '''
+ alist.sort(key=natural_keys) sorts in human order
+ http://nedbatchelder.com/blog/200712/human_sorting.html
+ (See Toothy's implementation in the comments)
+ '''
+ return [atoi(c) for c in re.split(r'(\d+)', text)]
+
+
+def get_model_state_file(checkpoint_dir, zero_stage):
+ if not os.path.isdir(checkpoint_dir):
+ raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+
+ # there should be only one file
+ if zero_stage <= 2:
+ file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+ elif zero_stage == 3:
+ file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+
+ if not os.path.exists(file):
+ raise FileNotFoundError(f"can't find model states file at '{file}'")
+
+ return file
+
+
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+ # XXX: need to test that this simple glob rule works for multi-node setup too
+ ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
+
+ if len(ckpt_files) == 0:
+ raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+
+ return ckpt_files
+
+
+def get_optim_files(checkpoint_dir):
+ return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
+
+
+def get_model_state_files(checkpoint_dir):
+ return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
+
+
+def parse_model_states(files):
+ zero_model_states = []
+ for file in files:
+ state_dict = torch.load(file, map_location=device)
+
+ if BUFFER_NAMES not in state_dict:
+ raise ValueError(f"{file} is not a model state checkpoint")
+ buffer_names = state_dict[BUFFER_NAMES]
+ if debug:
+ print("Found buffers:", buffer_names)
+
+ # recover just the buffers while restoring them to fp32 if they were saved in fp16
+ buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+ param_shapes = state_dict[PARAM_SHAPES]
+
+ # collect parameters that are included in param_shapes
+ param_names = []
+ for s in param_shapes:
+ for name in s.keys():
+ param_names.append(name)
+
+ # update with frozen parameters
+ frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+ if frozen_param_shapes is not None:
+ if debug:
+ print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+ param_names += list(frozen_param_shapes.keys())
+
+ # handle shared params
+ shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+
+ ds_version = state_dict.get(DS_VERSION, None)
+
+ frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+
+ z_model_state = zero_model_state(buffers=buffers,
+ param_shapes=param_shapes,
+ shared_params=shared_params,
+ ds_version=ds_version,
+ frozen_param_shapes=frozen_param_shapes,
+ frozen_param_fragments=frozen_param_fragments)
+ zero_model_states.append(z_model_state)
+
+ return zero_model_states
+
+
+def parse_optim_states(files, ds_checkpoint_dir):
+
+ total_files = len(files)
+ state_dicts = []
+ for f in files:
+ state_dict = torch.load(f, map_location=device)
+ # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
+ # and also handle the case where it was already removed by another helper script
+ state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
+ state_dicts.append(state_dict)
+
+ if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]:
+ raise ValueError(f"{files[0]} is not a zero checkpoint")
+ zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
+ world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
+
+ # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+ # parameters can be different from data parallelism for non-expert parameters. So we can just
+ # use the max of the partition_count to get the dp world_size.
+
+ if type(world_size) is list:
+ world_size = max(world_size)
+
+ if world_size != total_files:
+ raise ValueError(
+ f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+ "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+ )
+
+ # the groups are named differently in each stage
+ if zero_stage <= 2:
+ fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+ elif zero_stage == 3:
+ fp32_groups_key = FP32_FLAT_GROUPS
+ else:
+ raise ValueError(f"unknown zero stage {zero_stage}")
+
+ if zero_stage <= 2:
+ fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
+ elif zero_stage == 3:
+ # if there is more than one param group, there will be multiple flattened tensors - one
+ # flattened tensor per group - for simplicity merge them into a single tensor
+ #
+ # XXX: could make the script more memory efficient for when there are multiple groups - it
+ # will require matching the sub-lists of param_shapes for each param group flattened tensor
+
+ fp32_flat_groups = [
+ torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts))
+ ]
+
+ return zero_stage, world_size, fp32_flat_groups
+
+
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+ """
+ Returns fp32 state_dict reconstructed from ds checkpoint
+
+ Args:
+ - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+
+ """
+ print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+
+ optim_files = get_optim_files(ds_checkpoint_dir)
+ zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
+ print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+
+ model_files = get_model_state_files(ds_checkpoint_dir)
+
+ zero_model_states = parse_model_states(model_files)
+ print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+
+ if zero_stage <= 2:
+ return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters)
+ elif zero_stage == 3:
+ return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters)
+
+
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+ if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+ return
+
+ frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+ frozen_param_fragments = zero_model_states[0].frozen_param_fragments
+
+ if debug:
+ num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+ print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+ wanted_params = len(frozen_param_shapes)
+ wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+ avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+ print(f'Frozen params: Have {avail_numel} numels to process.')
+ print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+ total_params = 0
+ total_numel = 0
+ for name, shape in frozen_param_shapes.items():
+ total_params += 1
+ unpartitioned_numel = shape.numel()
+ total_numel += unpartitioned_numel
+
+ state_dict[name] = frozen_param_fragments[name]
+
+ if debug:
+ print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+
+ print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _has_callable(obj, fn):
+ attr = getattr(obj, fn, None)
+ return callable(attr)
+
+
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+ param_shapes = zero_model_states[0].param_shapes
+
+ # Reconstruction protocol:
+ #
+ # XXX: document this
+
+ if debug:
+ for i in range(world_size):
+ for j in range(len(fp32_flat_groups[0])):
+ print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+
+ # XXX: memory usage doubles here (zero2)
+ num_param_groups = len(fp32_flat_groups[0])
+ merged_single_partition_of_fp32_groups = []
+ for i in range(num_param_groups):
+ merged_partitions = [sd[i] for sd in fp32_flat_groups]
+ full_single_fp32_vector = torch.cat(merged_partitions, 0)
+ merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+ avail_numel = sum(
+ [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+
+ if debug:
+ wanted_params = sum([len(shapes) for shapes in param_shapes])
+ wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+ # not asserting if there is a mismatch due to possible padding
+ print(f"Have {avail_numel} numels to process.")
+ print(f"Need {wanted_numel} numels in {wanted_params} params.")
+
+ # params
+ # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+ # out-of-core computing solution
+ total_numel = 0
+ total_params = 0
+ for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+ offset = 0
+ avail_numel = full_single_fp32_vector.numel()
+ for name, shape in shapes.items():
+
+ unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+ total_numel += unpartitioned_numel
+ total_params += 1
+
+ if debug:
+ print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+ state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+ offset += unpartitioned_numel
+
+ # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+ # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+ # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+ # live optimizer object, so we are checking that the numbers are within the right range
+ align_to = 2 * world_size
+
+ def zero2_align(x):
+ return align_to * math.ceil(x / align_to)
+
+ if debug:
+ print(f"original offset={offset}, avail_numel={avail_numel}")
+
+ offset = zero2_align(offset)
+ avail_numel = zero2_align(avail_numel)
+
+ if debug:
+ print(f"aligned offset={offset}, avail_numel={avail_numel}")
+
+ # Sanity check
+ if offset != avail_numel:
+ raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+ print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters):
+ state_dict = OrderedDict()
+
+ # buffers
+ buffers = zero_model_states[0].buffers
+ state_dict.update(buffers)
+ if debug:
+ print(f"added {len(buffers)} buffers")
+
+ if not exclude_frozen_parameters:
+ _zero2_merge_frozen_params(state_dict, zero_model_states)
+
+ _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+ # recover shared parameters
+ for pair in zero_model_states[0].shared_params:
+ if pair[1] in state_dict:
+ state_dict[pair[0]] = state_dict[pair[1]]
+
+ return state_dict
+
+
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+ remainder = unpartitioned_numel % world_size
+ padding_numel = (world_size - remainder) if remainder else 0
+ partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+ return partitioned_numel, padding_numel
+
+
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+ if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+ return
+
+ if debug:
+ for i in range(world_size):
+ num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+ print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+ frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+ wanted_params = len(frozen_param_shapes)
+ wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+ avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+ print(f'Frozen params: Have {avail_numel} numels to process.')
+ print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+ total_params = 0
+ total_numel = 0
+ for name, shape in zero_model_states[0].frozen_param_shapes.items():
+ total_params += 1
+ unpartitioned_numel = shape.numel()
+ total_numel += unpartitioned_numel
+
+ param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+ state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+
+ partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+ if debug:
+ print(
+ f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+ )
+
+ print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+ param_shapes = zero_model_states[0].param_shapes
+ avail_numel = fp32_flat_groups[0].numel() * world_size
+ # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+ # param, re-consolidating each param, while dealing with padding if any
+
+ # merge list of dicts, preserving order
+ param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+
+ if debug:
+ for i in range(world_size):
+ print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+
+ wanted_params = len(param_shapes)
+ wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+ # not asserting if there is a mismatch due to possible padding
+ avail_numel = fp32_flat_groups[0].numel() * world_size
+ print(f"Trainable params: Have {avail_numel} numels to process.")
+ print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+
+ # params
+ # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+ # out-of-core computing solution
+ offset = 0
+ total_numel = 0
+ total_params = 0
+ for name, shape in param_shapes.items():
+
+ unpartitioned_numel = shape.numel()
+ total_numel += unpartitioned_numel
+ total_params += 1
+
+ partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+ if debug:
+ print(
+ f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+ )
+
+ # XXX: memory usage doubles here
+ state_dict[name] = torch.cat(
+ tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)),
+ 0).narrow(0, 0, unpartitioned_numel).view(shape)
+ offset += partitioned_numel
+
+ offset *= world_size
+
+ # Sanity check
+ if offset != avail_numel:
+ raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+ print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+ exclude_frozen_parameters):
+ state_dict = OrderedDict()
+
+ # buffers
+ buffers = zero_model_states[0].buffers
+ state_dict.update(buffers)
+ if debug:
+ print(f"added {len(buffers)} buffers")
+
+ if not exclude_frozen_parameters:
+ _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+
+ _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+ # recover shared parameters
+ for pair in zero_model_states[0].shared_params:
+ if pair[1] in state_dict:
+ state_dict[pair[0]] = state_dict[pair[1]]
+
+ return state_dict
+
+
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False):
+ """
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+ ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+ via a model hub.
+
+ Args:
+ - ``checkpoint_dir``: path to the desired checkpoint folder
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+ - ``exclude_frozen_parameters``: exclude frozen parameters
+
+ Returns:
+ - pytorch ``state_dict``
+
+ Note: this approach may not work if your application doesn't have sufficient free CPU memory and
+ you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+ the checkpoint.
+
+ A typical usage might be ::
+
+ from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+ # do the training and checkpoint saving
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+ model = model.cpu() # move to cpu
+ model.load_state_dict(state_dict)
+ # submit to model hub or save the model to share with others
+
+ In this example the ``model`` will no longer be usable in the deepspeed context of the same
+ application. i.e. you will need to re-initialize the deepspeed engine, since
+ ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+ If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+
+ """
+ if tag is None:
+ latest_path = os.path.join(checkpoint_dir, 'latest')
+ if os.path.isfile(latest_path):
+ with open(latest_path, 'r') as fd:
+ tag = fd.read().strip()
+ else:
+ raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+
+ ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+
+ if not os.path.isdir(ds_checkpoint_dir):
+ raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+
+ return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+
+
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False):
+ """
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
+ loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+
+ Args:
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+ - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+ - ``exclude_frozen_parameters``: exclude frozen parameters
+ """
+
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters)
+ print(f"Saving fp32 state dict to {output_file}")
+ torch.save(state_dict, output_file)
+
+
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+ """
+ 1. Put the provided model to cpu
+ 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+ 3. Load it into the provided model
+
+ Args:
+ - ``model``: the model object to update
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+
+ Returns:
+ - ``model`: modified model
+
+ Make sure you have plenty of CPU memory available before you call this function. If you don't
+ have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+ conveniently placed for you in the checkpoint folder.
+
+ A typical usage might be ::
+
+ from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+ model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+ # submit to model hub or save the model to share with others
+
+ Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+ of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+ ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+ """
+ logger.info(f"Extracting fp32 weights")
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+
+ logger.info(f"Overwriting model with fp32 weights")
+ model = model.cpu()
+ model.load_state_dict(state_dict, strict=False)
+
+ return model
+
+
+if __name__ == "__main__":
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument("checkpoint_dir",
+ type=str,
+ help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+ parser.add_argument(
+ "output_file",
+ type=str,
+ help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)")
+ parser.add_argument("-t",
+ "--tag",
+ type=str,
+ default=None,
+ help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+ parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+ parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+ args = parser.parse_args()
+
+ debug = args.debug
+
+ convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+ args.output_file,
+ tag=args.tag,
+ exclude_frozen_parameters=args.exclude_frozen_parameters)
diff --git a/checkpoint-72/README.md b/checkpoint-72/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..fdf619c317c2fe82074662582dbd68166b6f9d50
--- /dev/null
+++ b/checkpoint-72/README.md
@@ -0,0 +1,202 @@
+---
+base_model: nvidia/Llama-3.1-Nemotron-70B-Instruct-HF
+library_name: peft
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.15.0
\ No newline at end of file
diff --git a/checkpoint-72/adapter_config.json b/checkpoint-72/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..3abb5d68d20446d2b99ace226d6233a68590205a
--- /dev/null
+++ b/checkpoint-72/adapter_config.json
@@ -0,0 +1,42 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF",
+ "bias": "none",
+ "corda_config": null,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": null,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 512,
+ "lora_bias": false,
+ "lora_dropout": 0.05,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": [
+ "embed_tokens",
+ "lm_head"
+ ],
+ "peft_type": "LORA",
+ "r": 256,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "o_proj",
+ "up_proj",
+ "down_proj",
+ "gate_proj",
+ "v_proj",
+ "k_proj",
+ "q_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-72/adapter_model.safetensors b/checkpoint-72/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..a7a4f286a0cb5cf8564aa41469d1ff5bff5ba57c
--- /dev/null
+++ b/checkpoint-72/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e31fc3dc2e7df1a2e917cf03e227c421a0435c3cefcd72109ad7ee3e4967bf69
+size 10829849744
diff --git a/checkpoint-72/global_step72/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/checkpoint-72/global_step72/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..203d596137371769dfad3c5a5bc3f69cb26f6ef5
--- /dev/null
+++ b/checkpoint-72/global_step72/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dfc9940d26ef68be806edd3cba19af70f2e714b804ba4dfd9d9be57e670d0615
+size 21659418140
diff --git a/checkpoint-72/global_step72/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/checkpoint-72/global_step72/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..dc7642b28f39ff8b5c72416cc36d378e979b4006
--- /dev/null
+++ b/checkpoint-72/global_step72/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0abfd6850435d66c85274bf28f747ce56574eee8dc3405ac223a459f279ac69b
+size 21659457372
diff --git a/checkpoint-72/global_step72/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/checkpoint-72/global_step72/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..b3e99c0e17a52bc4664bd1573d0534b942c5001e
--- /dev/null
+++ b/checkpoint-72/global_step72/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e933640492efca8c6fb86684cb2c3fe4e107eb06fd3575b30c5207110ad4c728
+size 21659417820
diff --git a/checkpoint-72/global_step72/mp_rank_00_model_states.pt b/checkpoint-72/global_step72/mp_rank_00_model_states.pt
new file mode 100644
index 0000000000000000000000000000000000000000..a71b3daa308c6eec348965d2f5f09c6b033d0d26
--- /dev/null
+++ b/checkpoint-72/global_step72/mp_rank_00_model_states.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6d8f3f6a325d39cde97c822e808f67697583ba67cfc5477af338d36378dd320d
+size 11918643933
diff --git a/checkpoint-72/latest b/checkpoint-72/latest
new file mode 100644
index 0000000000000000000000000000000000000000..f3ff0f3ef57eac4f36c543b2d7ef78ca727041bd
--- /dev/null
+++ b/checkpoint-72/latest
@@ -0,0 +1 @@
+global_step72
\ No newline at end of file
diff --git a/checkpoint-72/rng_state_0.pth b/checkpoint-72/rng_state_0.pth
new file mode 100644
index 0000000000000000000000000000000000000000..6e03436dd77f0f742b73e3f601a58d05364ee48b
--- /dev/null
+++ b/checkpoint-72/rng_state_0.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:541a247a0499946942d469144d4609ab54f406a01327defecf24e55cce3eaaff
+size 14768
diff --git a/checkpoint-72/rng_state_1.pth b/checkpoint-72/rng_state_1.pth
new file mode 100644
index 0000000000000000000000000000000000000000..0d2065fa0d9a503d409eaed77bd3dafcec8c6e51
--- /dev/null
+++ b/checkpoint-72/rng_state_1.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:55f595d3bc4cf74ef1c4bf07834b2d3c1153e4c96ec66ee50cd533cd68d3f2be
+size 14768
diff --git a/checkpoint-72/rng_state_2.pth b/checkpoint-72/rng_state_2.pth
new file mode 100644
index 0000000000000000000000000000000000000000..b59ecfae7f98fa951c562700fd917c39af7c9ffe
--- /dev/null
+++ b/checkpoint-72/rng_state_2.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a141ba5106d9cb0d6d4ea1db081a08d8a6182e2ca548def74038dc2ab25e5894
+size 14768
diff --git a/checkpoint-72/scheduler.pt b/checkpoint-72/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..324d6e5ba59dc80aafdba02fe2ebc9eac737c54b
--- /dev/null
+++ b/checkpoint-72/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:14d3ad851fc136efe822990f8b99840e98b2ff20804944bcf122f2cafb45ed1f
+size 1064
diff --git a/checkpoint-72/special_tokens_map.json b/checkpoint-72/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..278b7f0f84be865c4687700ee7b3c63d89a51e18
--- /dev/null
+++ b/checkpoint-72/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+ "bos_token": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "<|eot_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/checkpoint-72/tokenizer.json b/checkpoint-72/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2
--- /dev/null
+++ b/checkpoint-72/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
+size 17209920
diff --git a/checkpoint-72/tokenizer_config.json b/checkpoint-72/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..fdde94c29816839ec3c29d6c6461206a49911f3c
--- /dev/null
+++ b/checkpoint-72/tokenizer_config.json
@@ -0,0 +1,2064 @@
+{
+ "added_tokens_decoder": {
+ "128000": {
+ "content": "<|begin_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128001": {
+ "content": "<|end_of_text|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128002": {
+ "content": "<|reserved_special_token_0|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128003": {
+ "content": "<|reserved_special_token_1|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128004": {
+ "content": "<|finetune_right_pad_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128005": {
+ "content": "<|reserved_special_token_2|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128006": {
+ "content": "<|start_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128007": {
+ "content": "<|end_header_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128008": {
+ "content": "<|eom_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128009": {
+ "content": "<|eot_id|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128010": {
+ "content": "<|python_tag|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128011": {
+ "content": "<|reserved_special_token_3|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128012": {
+ "content": "<|reserved_special_token_4|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128013": {
+ "content": "<|reserved_special_token_5|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128014": {
+ "content": "<|reserved_special_token_6|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128015": {
+ "content": "<|reserved_special_token_7|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128016": {
+ "content": "<|reserved_special_token_8|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128017": {
+ "content": "<|reserved_special_token_9|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128018": {
+ "content": "<|reserved_special_token_10|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128019": {
+ "content": "<|reserved_special_token_11|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128020": {
+ "content": "<|reserved_special_token_12|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128021": {
+ "content": "<|reserved_special_token_13|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128022": {
+ "content": "<|reserved_special_token_14|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128023": {
+ "content": "<|reserved_special_token_15|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128024": {
+ "content": "<|reserved_special_token_16|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128025": {
+ "content": "<|reserved_special_token_17|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128026": {
+ "content": "<|reserved_special_token_18|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128027": {
+ "content": "<|reserved_special_token_19|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128028": {
+ "content": "<|reserved_special_token_20|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128029": {
+ "content": "<|reserved_special_token_21|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128030": {
+ "content": "<|reserved_special_token_22|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128031": {
+ "content": "<|reserved_special_token_23|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128032": {
+ "content": "<|reserved_special_token_24|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128033": {
+ "content": "<|reserved_special_token_25|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128034": {
+ "content": "<|reserved_special_token_26|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128035": {
+ "content": "<|reserved_special_token_27|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128036": {
+ "content": "<|reserved_special_token_28|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128037": {
+ "content": "<|reserved_special_token_29|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128038": {
+ "content": "<|reserved_special_token_30|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128039": {
+ "content": "<|reserved_special_token_31|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128040": {
+ "content": "<|reserved_special_token_32|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128041": {
+ "content": "<|reserved_special_token_33|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128042": {
+ "content": "<|reserved_special_token_34|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128043": {
+ "content": "<|reserved_special_token_35|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128044": {
+ "content": "<|reserved_special_token_36|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128045": {
+ "content": "<|reserved_special_token_37|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128046": {
+ "content": "<|reserved_special_token_38|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128047": {
+ "content": "<|reserved_special_token_39|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128048": {
+ "content": "<|reserved_special_token_40|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128049": {
+ "content": "<|reserved_special_token_41|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128050": {
+ "content": "<|reserved_special_token_42|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128051": {
+ "content": "<|reserved_special_token_43|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128052": {
+ "content": "<|reserved_special_token_44|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128053": {
+ "content": "<|reserved_special_token_45|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128054": {
+ "content": "<|reserved_special_token_46|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128055": {
+ "content": "<|reserved_special_token_47|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128056": {
+ "content": "<|reserved_special_token_48|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128057": {
+ "content": "<|reserved_special_token_49|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128058": {
+ "content": "<|reserved_special_token_50|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128059": {
+ "content": "<|reserved_special_token_51|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128060": {
+ "content": "<|reserved_special_token_52|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128061": {
+ "content": "<|reserved_special_token_53|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128062": {
+ "content": "<|reserved_special_token_54|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128063": {
+ "content": "<|reserved_special_token_55|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128064": {
+ "content": "<|reserved_special_token_56|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128065": {
+ "content": "<|reserved_special_token_57|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128066": {
+ "content": "<|reserved_special_token_58|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128067": {
+ "content": "<|reserved_special_token_59|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128068": {
+ "content": "<|reserved_special_token_60|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128069": {
+ "content": "<|reserved_special_token_61|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128070": {
+ "content": "<|reserved_special_token_62|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128071": {
+ "content": "<|reserved_special_token_63|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128072": {
+ "content": "<|reserved_special_token_64|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128073": {
+ "content": "<|reserved_special_token_65|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128074": {
+ "content": "<|reserved_special_token_66|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128075": {
+ "content": "<|reserved_special_token_67|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128076": {
+ "content": "<|reserved_special_token_68|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128077": {
+ "content": "<|reserved_special_token_69|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128078": {
+ "content": "<|reserved_special_token_70|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128079": {
+ "content": "<|reserved_special_token_71|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128080": {
+ "content": "<|reserved_special_token_72|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128081": {
+ "content": "<|reserved_special_token_73|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128082": {
+ "content": "<|reserved_special_token_74|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128083": {
+ "content": "<|reserved_special_token_75|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128084": {
+ "content": "<|reserved_special_token_76|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128085": {
+ "content": "<|reserved_special_token_77|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128086": {
+ "content": "<|reserved_special_token_78|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128087": {
+ "content": "<|reserved_special_token_79|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128088": {
+ "content": "<|reserved_special_token_80|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128089": {
+ "content": "<|reserved_special_token_81|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128090": {
+ "content": "<|reserved_special_token_82|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128091": {
+ "content": "<|reserved_special_token_83|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128092": {
+ "content": "<|reserved_special_token_84|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128093": {
+ "content": "<|reserved_special_token_85|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128094": {
+ "content": "<|reserved_special_token_86|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128095": {
+ "content": "<|reserved_special_token_87|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128096": {
+ "content": "<|reserved_special_token_88|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128097": {
+ "content": "<|reserved_special_token_89|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128098": {
+ "content": "<|reserved_special_token_90|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128099": {
+ "content": "<|reserved_special_token_91|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128100": {
+ "content": "<|reserved_special_token_92|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128101": {
+ "content": "<|reserved_special_token_93|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128102": {
+ "content": "<|reserved_special_token_94|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128103": {
+ "content": "<|reserved_special_token_95|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128104": {
+ "content": "<|reserved_special_token_96|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128105": {
+ "content": "<|reserved_special_token_97|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128106": {
+ "content": "<|reserved_special_token_98|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128107": {
+ "content": "<|reserved_special_token_99|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128108": {
+ "content": "<|reserved_special_token_100|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128109": {
+ "content": "<|reserved_special_token_101|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128110": {
+ "content": "<|reserved_special_token_102|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128111": {
+ "content": "<|reserved_special_token_103|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128112": {
+ "content": "<|reserved_special_token_104|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128113": {
+ "content": "<|reserved_special_token_105|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128114": {
+ "content": "<|reserved_special_token_106|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128115": {
+ "content": "<|reserved_special_token_107|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128116": {
+ "content": "<|reserved_special_token_108|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128117": {
+ "content": "<|reserved_special_token_109|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128118": {
+ "content": "<|reserved_special_token_110|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128119": {
+ "content": "<|reserved_special_token_111|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128120": {
+ "content": "<|reserved_special_token_112|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128121": {
+ "content": "<|reserved_special_token_113|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128122": {
+ "content": "<|reserved_special_token_114|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128123": {
+ "content": "<|reserved_special_token_115|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128124": {
+ "content": "<|reserved_special_token_116|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128125": {
+ "content": "<|reserved_special_token_117|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128126": {
+ "content": "<|reserved_special_token_118|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128127": {
+ "content": "<|reserved_special_token_119|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128128": {
+ "content": "<|reserved_special_token_120|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128129": {
+ "content": "<|reserved_special_token_121|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128130": {
+ "content": "<|reserved_special_token_122|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128131": {
+ "content": "<|reserved_special_token_123|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128132": {
+ "content": "<|reserved_special_token_124|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128133": {
+ "content": "<|reserved_special_token_125|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128134": {
+ "content": "<|reserved_special_token_126|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128135": {
+ "content": "<|reserved_special_token_127|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128136": {
+ "content": "<|reserved_special_token_128|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128137": {
+ "content": "<|reserved_special_token_129|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128138": {
+ "content": "<|reserved_special_token_130|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128139": {
+ "content": "<|reserved_special_token_131|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128140": {
+ "content": "<|reserved_special_token_132|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128141": {
+ "content": "<|reserved_special_token_133|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128142": {
+ "content": "<|reserved_special_token_134|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128143": {
+ "content": "<|reserved_special_token_135|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128144": {
+ "content": "<|reserved_special_token_136|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128145": {
+ "content": "<|reserved_special_token_137|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128146": {
+ "content": "<|reserved_special_token_138|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128147": {
+ "content": "<|reserved_special_token_139|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128148": {
+ "content": "<|reserved_special_token_140|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128149": {
+ "content": "<|reserved_special_token_141|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128150": {
+ "content": "<|reserved_special_token_142|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128151": {
+ "content": "<|reserved_special_token_143|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128152": {
+ "content": "<|reserved_special_token_144|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128153": {
+ "content": "<|reserved_special_token_145|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128154": {
+ "content": "<|reserved_special_token_146|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128155": {
+ "content": "<|reserved_special_token_147|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128156": {
+ "content": "<|reserved_special_token_148|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128157": {
+ "content": "<|reserved_special_token_149|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128158": {
+ "content": "<|reserved_special_token_150|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128159": {
+ "content": "<|reserved_special_token_151|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128160": {
+ "content": "<|reserved_special_token_152|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128161": {
+ "content": "<|reserved_special_token_153|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128162": {
+ "content": "<|reserved_special_token_154|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128163": {
+ "content": "<|reserved_special_token_155|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128164": {
+ "content": "<|reserved_special_token_156|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128165": {
+ "content": "<|reserved_special_token_157|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128166": {
+ "content": "<|reserved_special_token_158|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128167": {
+ "content": "<|reserved_special_token_159|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128168": {
+ "content": "<|reserved_special_token_160|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128169": {
+ "content": "<|reserved_special_token_161|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128170": {
+ "content": "<|reserved_special_token_162|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128171": {
+ "content": "<|reserved_special_token_163|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128172": {
+ "content": "<|reserved_special_token_164|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128173": {
+ "content": "<|reserved_special_token_165|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128174": {
+ "content": "<|reserved_special_token_166|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128175": {
+ "content": "<|reserved_special_token_167|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128176": {
+ "content": "<|reserved_special_token_168|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128177": {
+ "content": "<|reserved_special_token_169|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128178": {
+ "content": "<|reserved_special_token_170|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128179": {
+ "content": "<|reserved_special_token_171|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128180": {
+ "content": "<|reserved_special_token_172|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128181": {
+ "content": "<|reserved_special_token_173|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128182": {
+ "content": "<|reserved_special_token_174|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128183": {
+ "content": "<|reserved_special_token_175|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128184": {
+ "content": "<|reserved_special_token_176|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128185": {
+ "content": "<|reserved_special_token_177|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128186": {
+ "content": "<|reserved_special_token_178|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128187": {
+ "content": "<|reserved_special_token_179|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128188": {
+ "content": "<|reserved_special_token_180|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128189": {
+ "content": "<|reserved_special_token_181|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128190": {
+ "content": "<|reserved_special_token_182|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128191": {
+ "content": "<|reserved_special_token_183|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128192": {
+ "content": "<|reserved_special_token_184|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128193": {
+ "content": "<|reserved_special_token_185|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128194": {
+ "content": "<|reserved_special_token_186|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128195": {
+ "content": "<|reserved_special_token_187|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128196": {
+ "content": "<|reserved_special_token_188|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128197": {
+ "content": "<|reserved_special_token_189|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128198": {
+ "content": "<|reserved_special_token_190|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128199": {
+ "content": "<|reserved_special_token_191|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128200": {
+ "content": "<|reserved_special_token_192|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128201": {
+ "content": "<|reserved_special_token_193|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128202": {
+ "content": "<|reserved_special_token_194|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128203": {
+ "content": "<|reserved_special_token_195|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128204": {
+ "content": "<|reserved_special_token_196|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128205": {
+ "content": "<|reserved_special_token_197|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128206": {
+ "content": "<|reserved_special_token_198|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128207": {
+ "content": "<|reserved_special_token_199|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128208": {
+ "content": "<|reserved_special_token_200|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128209": {
+ "content": "<|reserved_special_token_201|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128210": {
+ "content": "<|reserved_special_token_202|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128211": {
+ "content": "<|reserved_special_token_203|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128212": {
+ "content": "<|reserved_special_token_204|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128213": {
+ "content": "<|reserved_special_token_205|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128214": {
+ "content": "<|reserved_special_token_206|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128215": {
+ "content": "<|reserved_special_token_207|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128216": {
+ "content": "<|reserved_special_token_208|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128217": {
+ "content": "<|reserved_special_token_209|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128218": {
+ "content": "<|reserved_special_token_210|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128219": {
+ "content": "<|reserved_special_token_211|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128220": {
+ "content": "<|reserved_special_token_212|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128221": {
+ "content": "<|reserved_special_token_213|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128222": {
+ "content": "<|reserved_special_token_214|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128223": {
+ "content": "<|reserved_special_token_215|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128224": {
+ "content": "<|reserved_special_token_216|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128225": {
+ "content": "<|reserved_special_token_217|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128226": {
+ "content": "<|reserved_special_token_218|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128227": {
+ "content": "<|reserved_special_token_219|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128228": {
+ "content": "<|reserved_special_token_220|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128229": {
+ "content": "<|reserved_special_token_221|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128230": {
+ "content": "<|reserved_special_token_222|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128231": {
+ "content": "<|reserved_special_token_223|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128232": {
+ "content": "<|reserved_special_token_224|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128233": {
+ "content": "<|reserved_special_token_225|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128234": {
+ "content": "<|reserved_special_token_226|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128235": {
+ "content": "<|reserved_special_token_227|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128236": {
+ "content": "<|reserved_special_token_228|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128237": {
+ "content": "<|reserved_special_token_229|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128238": {
+ "content": "<|reserved_special_token_230|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128239": {
+ "content": "<|reserved_special_token_231|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128240": {
+ "content": "<|reserved_special_token_232|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128241": {
+ "content": "<|reserved_special_token_233|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128242": {
+ "content": "<|reserved_special_token_234|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128243": {
+ "content": "<|reserved_special_token_235|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128244": {
+ "content": "<|reserved_special_token_236|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128245": {
+ "content": "<|reserved_special_token_237|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128246": {
+ "content": "<|reserved_special_token_238|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128247": {
+ "content": "<|reserved_special_token_239|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128248": {
+ "content": "<|reserved_special_token_240|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128249": {
+ "content": "<|reserved_special_token_241|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128250": {
+ "content": "<|reserved_special_token_242|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128251": {
+ "content": "<|reserved_special_token_243|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128252": {
+ "content": "<|reserved_special_token_244|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128253": {
+ "content": "<|reserved_special_token_245|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128254": {
+ "content": "<|reserved_special_token_246|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "128255": {
+ "content": "<|reserved_special_token_247|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<|begin_of_text|>",
+ "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n",
+ "clean_up_tokenization_spaces": true,
+ "eos_token": "<|eot_id|>",
+ "extra_special_tokens": {},
+ "model_input_names": [
+ "input_ids",
+ "attention_mask"
+ ],
+ "model_max_length": 131072,
+ "pad_token": "<|end_of_text|>",
+ "tokenizer_class": "PreTrainedTokenizer"
+}
diff --git a/checkpoint-72/trainer_state.json b/checkpoint-72/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..9ed6604a86f8c60058ede5db61e8a228f5bf08b0
--- /dev/null
+++ b/checkpoint-72/trainer_state.json
@@ -0,0 +1,537 @@
+{
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 0.9953917050691244,
+ "eval_steps": 500,
+ "global_step": 72,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.013824884792626729,
+ "grad_norm": 34.963134765625,
+ "learning_rate": 5.0000000000000004e-08,
+ "loss": 2.5476,
+ "step": 1
+ },
+ {
+ "epoch": 0.027649769585253458,
+ "grad_norm": 35.32600021362305,
+ "learning_rate": 1.0000000000000001e-07,
+ "loss": 2.6058,
+ "step": 2
+ },
+ {
+ "epoch": 0.041474654377880185,
+ "grad_norm": 34.955448150634766,
+ "learning_rate": 1.5000000000000002e-07,
+ "loss": 2.5871,
+ "step": 3
+ },
+ {
+ "epoch": 0.055299539170506916,
+ "grad_norm": 35.09806442260742,
+ "learning_rate": 2.0000000000000002e-07,
+ "loss": 2.5912,
+ "step": 4
+ },
+ {
+ "epoch": 0.06912442396313365,
+ "grad_norm": 34.88739776611328,
+ "learning_rate": 2.5000000000000004e-07,
+ "loss": 2.592,
+ "step": 5
+ },
+ {
+ "epoch": 0.08294930875576037,
+ "grad_norm": 34.84288024902344,
+ "learning_rate": 3.0000000000000004e-07,
+ "loss": 2.5609,
+ "step": 6
+ },
+ {
+ "epoch": 0.0967741935483871,
+ "grad_norm": 35.0090217590332,
+ "learning_rate": 3.5000000000000004e-07,
+ "loss": 2.5651,
+ "step": 7
+ },
+ {
+ "epoch": 0.11059907834101383,
+ "grad_norm": 35.03983688354492,
+ "learning_rate": 4.0000000000000003e-07,
+ "loss": 2.5437,
+ "step": 8
+ },
+ {
+ "epoch": 0.12442396313364056,
+ "grad_norm": 34.802833557128906,
+ "learning_rate": 4.5000000000000003e-07,
+ "loss": 2.5448,
+ "step": 9
+ },
+ {
+ "epoch": 0.1382488479262673,
+ "grad_norm": 34.5220947265625,
+ "learning_rate": 5.000000000000001e-07,
+ "loss": 2.504,
+ "step": 10
+ },
+ {
+ "epoch": 0.15207373271889402,
+ "grad_norm": 34.401580810546875,
+ "learning_rate": 5.5e-07,
+ "loss": 2.4814,
+ "step": 11
+ },
+ {
+ "epoch": 0.16589861751152074,
+ "grad_norm": 33.76997375488281,
+ "learning_rate": 6.000000000000001e-07,
+ "loss": 2.4282,
+ "step": 12
+ },
+ {
+ "epoch": 0.17972350230414746,
+ "grad_norm": 33.53415298461914,
+ "learning_rate": 6.5e-07,
+ "loss": 2.4216,
+ "step": 13
+ },
+ {
+ "epoch": 0.1935483870967742,
+ "grad_norm": 32.401580810546875,
+ "learning_rate": 7.000000000000001e-07,
+ "loss": 2.3362,
+ "step": 14
+ },
+ {
+ "epoch": 0.2073732718894009,
+ "grad_norm": 33.636661529541016,
+ "learning_rate": 7.5e-07,
+ "loss": 2.2978,
+ "step": 15
+ },
+ {
+ "epoch": 0.22119815668202766,
+ "grad_norm": 31.3782901763916,
+ "learning_rate": 8.000000000000001e-07,
+ "loss": 2.1358,
+ "step": 16
+ },
+ {
+ "epoch": 0.2350230414746544,
+ "grad_norm": 30.72391700744629,
+ "learning_rate": 8.500000000000001e-07,
+ "loss": 2.0652,
+ "step": 17
+ },
+ {
+ "epoch": 0.2488479262672811,
+ "grad_norm": 30.817584991455078,
+ "learning_rate": 9.000000000000001e-07,
+ "loss": 2.0115,
+ "step": 18
+ },
+ {
+ "epoch": 0.2626728110599078,
+ "grad_norm": 29.683996200561523,
+ "learning_rate": 9.500000000000001e-07,
+ "loss": 1.8668,
+ "step": 19
+ },
+ {
+ "epoch": 0.2764976958525346,
+ "grad_norm": 29.506683349609375,
+ "learning_rate": 1.0000000000000002e-06,
+ "loss": 1.7796,
+ "step": 20
+ },
+ {
+ "epoch": 0.2903225806451613,
+ "grad_norm": 27.55340003967285,
+ "learning_rate": 1.0500000000000001e-06,
+ "loss": 1.5656,
+ "step": 21
+ },
+ {
+ "epoch": 0.30414746543778803,
+ "grad_norm": 27.78036880493164,
+ "learning_rate": 1.1e-06,
+ "loss": 1.5112,
+ "step": 22
+ },
+ {
+ "epoch": 0.31797235023041476,
+ "grad_norm": 26.36115264892578,
+ "learning_rate": 1.1500000000000002e-06,
+ "loss": 1.3283,
+ "step": 23
+ },
+ {
+ "epoch": 0.3317972350230415,
+ "grad_norm": 25.388761520385742,
+ "learning_rate": 1.2000000000000002e-06,
+ "loss": 1.137,
+ "step": 24
+ },
+ {
+ "epoch": 0.3456221198156682,
+ "grad_norm": 25.21432876586914,
+ "learning_rate": 1.25e-06,
+ "loss": 0.9867,
+ "step": 25
+ },
+ {
+ "epoch": 0.35944700460829493,
+ "grad_norm": 24.924489974975586,
+ "learning_rate": 1.3e-06,
+ "loss": 0.7122,
+ "step": 26
+ },
+ {
+ "epoch": 0.37327188940092165,
+ "grad_norm": 21.881420135498047,
+ "learning_rate": 1.3500000000000002e-06,
+ "loss": 0.4952,
+ "step": 27
+ },
+ {
+ "epoch": 0.3870967741935484,
+ "grad_norm": 17.67154884338379,
+ "learning_rate": 1.4000000000000001e-06,
+ "loss": 0.3602,
+ "step": 28
+ },
+ {
+ "epoch": 0.4009216589861751,
+ "grad_norm": 11.489490509033203,
+ "learning_rate": 1.45e-06,
+ "loss": 0.2432,
+ "step": 29
+ },
+ {
+ "epoch": 0.4147465437788018,
+ "grad_norm": 7.622438907623291,
+ "learning_rate": 1.5e-06,
+ "loss": 0.189,
+ "step": 30
+ },
+ {
+ "epoch": 0.42857142857142855,
+ "grad_norm": 4.340638637542725,
+ "learning_rate": 1.5500000000000002e-06,
+ "loss": 0.1302,
+ "step": 31
+ },
+ {
+ "epoch": 0.4423963133640553,
+ "grad_norm": 3.079514980316162,
+ "learning_rate": 1.6000000000000001e-06,
+ "loss": 0.1075,
+ "step": 32
+ },
+ {
+ "epoch": 0.45622119815668205,
+ "grad_norm": 2.355943441390991,
+ "learning_rate": 1.6500000000000003e-06,
+ "loss": 0.0998,
+ "step": 33
+ },
+ {
+ "epoch": 0.4700460829493088,
+ "grad_norm": 1.9480725526809692,
+ "learning_rate": 1.7000000000000002e-06,
+ "loss": 0.0926,
+ "step": 34
+ },
+ {
+ "epoch": 0.4838709677419355,
+ "grad_norm": 1.8598166704177856,
+ "learning_rate": 1.75e-06,
+ "loss": 0.0733,
+ "step": 35
+ },
+ {
+ "epoch": 0.4976958525345622,
+ "grad_norm": 0.9892730712890625,
+ "learning_rate": 1.8000000000000001e-06,
+ "loss": 0.0664,
+ "step": 36
+ },
+ {
+ "epoch": 0.511520737327189,
+ "grad_norm": 0.8992418050765991,
+ "learning_rate": 1.85e-06,
+ "loss": 0.0709,
+ "step": 37
+ },
+ {
+ "epoch": 0.5253456221198156,
+ "grad_norm": 0.7340101599693298,
+ "learning_rate": 1.9000000000000002e-06,
+ "loss": 0.0535,
+ "step": 38
+ },
+ {
+ "epoch": 0.5391705069124424,
+ "grad_norm": 0.7032178044319153,
+ "learning_rate": 1.9500000000000004e-06,
+ "loss": 0.0573,
+ "step": 39
+ },
+ {
+ "epoch": 0.5529953917050692,
+ "grad_norm": 0.6449429392814636,
+ "learning_rate": 2.0000000000000003e-06,
+ "loss": 0.0576,
+ "step": 40
+ },
+ {
+ "epoch": 0.5668202764976958,
+ "grad_norm": 0.6358592510223389,
+ "learning_rate": 2.05e-06,
+ "loss": 0.0502,
+ "step": 41
+ },
+ {
+ "epoch": 0.5806451612903226,
+ "grad_norm": 0.572036623954773,
+ "learning_rate": 2.1000000000000002e-06,
+ "loss": 0.0556,
+ "step": 42
+ },
+ {
+ "epoch": 0.5944700460829493,
+ "grad_norm": 0.6538863778114319,
+ "learning_rate": 2.15e-06,
+ "loss": 0.0556,
+ "step": 43
+ },
+ {
+ "epoch": 0.6082949308755761,
+ "grad_norm": 0.3532159626483917,
+ "learning_rate": 2.2e-06,
+ "loss": 0.0452,
+ "step": 44
+ },
+ {
+ "epoch": 0.6221198156682027,
+ "grad_norm": 0.4853012263774872,
+ "learning_rate": 2.25e-06,
+ "loss": 0.0471,
+ "step": 45
+ },
+ {
+ "epoch": 0.6359447004608295,
+ "grad_norm": 0.4761648178100586,
+ "learning_rate": 2.3000000000000004e-06,
+ "loss": 0.0469,
+ "step": 46
+ },
+ {
+ "epoch": 0.6497695852534562,
+ "grad_norm": 0.6094638109207153,
+ "learning_rate": 2.35e-06,
+ "loss": 0.047,
+ "step": 47
+ },
+ {
+ "epoch": 0.663594470046083,
+ "grad_norm": 0.5211306214332581,
+ "learning_rate": 2.4000000000000003e-06,
+ "loss": 0.0402,
+ "step": 48
+ },
+ {
+ "epoch": 0.6774193548387096,
+ "grad_norm": 0.2997778356075287,
+ "learning_rate": 2.4500000000000003e-06,
+ "loss": 0.0425,
+ "step": 49
+ },
+ {
+ "epoch": 0.6912442396313364,
+ "grad_norm": 0.37834689021110535,
+ "learning_rate": 2.5e-06,
+ "loss": 0.0446,
+ "step": 50
+ },
+ {
+ "epoch": 0.7050691244239631,
+ "grad_norm": 0.31011995673179626,
+ "learning_rate": 2.55e-06,
+ "loss": 0.0406,
+ "step": 51
+ },
+ {
+ "epoch": 0.7188940092165899,
+ "grad_norm": 0.3113131523132324,
+ "learning_rate": 2.6e-06,
+ "loss": 0.0368,
+ "step": 52
+ },
+ {
+ "epoch": 0.7327188940092166,
+ "grad_norm": 0.5685846209526062,
+ "learning_rate": 2.6500000000000005e-06,
+ "loss": 0.0389,
+ "step": 53
+ },
+ {
+ "epoch": 0.7465437788018433,
+ "grad_norm": 0.29334983229637146,
+ "learning_rate": 2.7000000000000004e-06,
+ "loss": 0.0423,
+ "step": 54
+ },
+ {
+ "epoch": 0.7603686635944701,
+ "grad_norm": 0.5776861906051636,
+ "learning_rate": 2.7500000000000004e-06,
+ "loss": 0.0399,
+ "step": 55
+ },
+ {
+ "epoch": 0.7741935483870968,
+ "grad_norm": 0.35423165559768677,
+ "learning_rate": 2.8000000000000003e-06,
+ "loss": 0.0357,
+ "step": 56
+ },
+ {
+ "epoch": 0.7880184331797235,
+ "grad_norm": 0.37902742624282837,
+ "learning_rate": 2.85e-06,
+ "loss": 0.0407,
+ "step": 57
+ },
+ {
+ "epoch": 0.8018433179723502,
+ "grad_norm": 0.26948878169059753,
+ "learning_rate": 2.9e-06,
+ "loss": 0.0351,
+ "step": 58
+ },
+ {
+ "epoch": 0.815668202764977,
+ "grad_norm": 0.35688117146492004,
+ "learning_rate": 2.95e-06,
+ "loss": 0.0377,
+ "step": 59
+ },
+ {
+ "epoch": 0.8294930875576036,
+ "grad_norm": 0.5287911891937256,
+ "learning_rate": 3e-06,
+ "loss": 0.0377,
+ "step": 60
+ },
+ {
+ "epoch": 0.8433179723502304,
+ "grad_norm": 0.2950785756111145,
+ "learning_rate": 3.05e-06,
+ "loss": 0.0361,
+ "step": 61
+ },
+ {
+ "epoch": 0.8571428571428571,
+ "grad_norm": 0.2789723575115204,
+ "learning_rate": 3.1000000000000004e-06,
+ "loss": 0.032,
+ "step": 62
+ },
+ {
+ "epoch": 0.8709677419354839,
+ "grad_norm": 0.2802198529243469,
+ "learning_rate": 3.1500000000000003e-06,
+ "loss": 0.0394,
+ "step": 63
+ },
+ {
+ "epoch": 0.8847926267281107,
+ "grad_norm": 0.286981463432312,
+ "learning_rate": 3.2000000000000003e-06,
+ "loss": 0.033,
+ "step": 64
+ },
+ {
+ "epoch": 0.8986175115207373,
+ "grad_norm": 0.37392762303352356,
+ "learning_rate": 3.2500000000000002e-06,
+ "loss": 0.0335,
+ "step": 65
+ },
+ {
+ "epoch": 0.9124423963133641,
+ "grad_norm": 0.25025588274002075,
+ "learning_rate": 3.3000000000000006e-06,
+ "loss": 0.0311,
+ "step": 66
+ },
+ {
+ "epoch": 0.9262672811059908,
+ "grad_norm": 0.4292861521244049,
+ "learning_rate": 3.3500000000000005e-06,
+ "loss": 0.0362,
+ "step": 67
+ },
+ {
+ "epoch": 0.9400921658986175,
+ "grad_norm": 0.4717651307582855,
+ "learning_rate": 3.4000000000000005e-06,
+ "loss": 0.0303,
+ "step": 68
+ },
+ {
+ "epoch": 0.9539170506912442,
+ "grad_norm": 0.49291253089904785,
+ "learning_rate": 3.45e-06,
+ "loss": 0.0352,
+ "step": 69
+ },
+ {
+ "epoch": 0.967741935483871,
+ "grad_norm": 0.3729935586452484,
+ "learning_rate": 3.5e-06,
+ "loss": 0.0297,
+ "step": 70
+ },
+ {
+ "epoch": 0.9815668202764977,
+ "grad_norm": 0.27150583267211914,
+ "learning_rate": 3.5500000000000003e-06,
+ "loss": 0.0326,
+ "step": 71
+ },
+ {
+ "epoch": 0.9953917050691244,
+ "grad_norm": 0.34516096115112305,
+ "learning_rate": 3.6000000000000003e-06,
+ "loss": 0.0336,
+ "step": 72
+ }
+ ],
+ "logging_steps": 1,
+ "max_steps": 432,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 6,
+ "save_steps": 72,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 4.631891949769458e+18,
+ "train_batch_size": 4,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/checkpoint-72/training_args.bin b/checkpoint-72/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..48c70fd554062e31c1333fa196fcbd6a4f178c6c
--- /dev/null
+++ b/checkpoint-72/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d1d334ff15891d240486e54641859af8de96cab43a64ef7bf9dc417387365ae5
+size 7928
diff --git a/checkpoint-72/zero_to_fp32.py b/checkpoint-72/zero_to_fp32.py
new file mode 100644
index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8
--- /dev/null
+++ b/checkpoint-72/zero_to_fp32.py
@@ -0,0 +1,604 @@
+#!/usr/bin/env python
+
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example: python zero_to_fp32.py . pytorch_model.bin
+
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+from collections import OrderedDict
+from dataclasses import dataclass
+
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+ FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+ FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+
+
@dataclass
class zero_model_state:
    """Per-rank model state parsed from a DeepSpeed *_model_states.pt file.

    Fix: the original annotated fields with ``dict()`` — a *call* that builds a
    throwaway empty-dict instance used as the annotation object. Plain type
    names are the correct, tooling-friendly annotations and change no behavior.
    Values are whatever torch.load recovered (e.g. ds_version is typically a
    version string despite the ``int`` hint kept from upstream — TODO confirm).
    """
    buffers: dict                 # buffer name -> fp32 tensor
    param_shapes: dict            # trainable param name -> shape (list of per-group dicts upstream)
    shared_params: list           # [alias_name, source_name] pairs
    ds_version: int               # deepspeed version recorded in the checkpoint
    frozen_param_shapes: dict     # frozen param name -> shape, or None
    frozen_param_fragments: dict  # frozen param name -> tensor fragment, or None
+
+
# Global verbosity flag; the CLI entry point at the bottom overwrites it from --debug.
debug = 0

# load to cpu — no GPU required; all reconstruction happens host-side
device = torch.device('cpu')
+
+
def atoi(text):
    """Return int(text) when the text is all digits, otherwise the text itself."""
    if text.isdigit():
        return int(text)
    return text
+
+
def natural_keys(text):
    """Sort key producing human ("natural") ordering, e.g. rank2 < rank10.

    alist.sort(key=natural_keys) sorts in human order
    http://nedbatchelder.com/blog/200712/human_sorting.html
    (See Toothy's implementation in the comments)
    """
    chunks = re.split(r'(\d+)', text)
    return [atoi(chunk) for chunk in chunks]
+
+
def get_model_state_file(checkpoint_dir, zero_stage):
    """Return the rank-0 model-states file path for the given ZeRO stage.

    Args:
        checkpoint_dir: directory holding the DeepSpeed shard files.
        zero_stage: 1, 2 or 3 (stages 1/2 and 3 use different file names).

    Raises:
        FileNotFoundError: if the directory or the expected file is missing.
        ValueError: if zero_stage is not 1, 2 or 3. (Fix: the original left
            ``file`` unbound on any other stage and crashed with a confusing
            UnboundLocalError at the os.path.exists check.)
    """
    if not os.path.isdir(checkpoint_dir):
        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")

    # there should be only one file
    if zero_stage <= 2:
        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
    elif zero_stage == 3:
        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
    else:
        raise ValueError(f"unknown zero stage {zero_stage}")

    if not os.path.exists(file):
        raise FileNotFoundError(f"can't find model states file at '{file}'")

    return file
+
+
def get_checkpoint_files(checkpoint_dir, glob_pattern):
    """Glob for shard files under checkpoint_dir, naturally sorted by rank number."""
    # XXX: need to test that this simple glob rule works for multi-node setup too
    full_pattern = os.path.join(checkpoint_dir, glob_pattern)
    ckpt_files = sorted(glob.glob(full_pattern), key=natural_keys)

    if not ckpt_files:
        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")

    return ckpt_files
+
+
def get_optim_files(checkpoint_dir):
    """Sorted list of the per-rank optimizer shard files."""
    optim_glob = "*_optim_states.pt"
    return get_checkpoint_files(checkpoint_dir, optim_glob)
+
+
def get_model_state_files(checkpoint_dir):
    """Sorted list of the per-rank model-state shard files."""
    model_glob = "*_model_states.pt"
    return get_checkpoint_files(checkpoint_dir, model_glob)
+
+
def parse_model_states(files):
    """Parse each rank's *_model_states.pt into a zero_model_state record.

    Args:
        files: naturally-sorted list of per-rank model-states file paths.

    Returns:
        list of zero_model_state, one per rank, in file order.

    Raises:
        ValueError: if a file lacks the buffer-names key, i.e. it is not a
            DeepSpeed model-states checkpoint.
    """
    zero_model_states = []
    for file in files:
        state_dict = torch.load(file, map_location=device)

        if BUFFER_NAMES not in state_dict:
            raise ValueError(f"{file} is not a model state checkpoint")
        buffer_names = state_dict[BUFFER_NAMES]
        if debug:
            print("Found buffers:", buffer_names)

        # recover just the buffers while restoring them to fp32 if they were saved in fp16
        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
        param_shapes = state_dict[PARAM_SHAPES]

        # collect parameters that are included in param_shapes
        # NOTE(review): param_names is built here but never used afterwards — dead code kept from upstream
        param_names = []
        for s in param_shapes:
            for name in s.keys():
                param_names.append(name)

        # update with frozen parameters
        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
        if frozen_param_shapes is not None:
            if debug:
                print(f"Found frozen_param_shapes: {frozen_param_shapes}")
            param_names += list(frozen_param_shapes.keys())

        # handle shared params ([alias_name, source_name] pairs)
        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]

        # version may be absent in very old checkpoints, hence .get with None default
        ds_version = state_dict.get(DS_VERSION, None)

        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)

        z_model_state = zero_model_state(buffers=buffers,
                                         param_shapes=param_shapes,
                                         shared_params=shared_params,
                                         ds_version=ds_version,
                                         frozen_param_shapes=frozen_param_shapes,
                                         frozen_param_fragments=frozen_param_fragments)
        zero_model_states.append(z_model_state)

    return zero_model_states
+
+
def parse_optim_states(files, ds_checkpoint_dir):
    """Load all per-rank *_optim_states.pt files and extract the fp32 master weights.

    Args:
        files: naturally-sorted per-rank optimizer shard file paths.
        ds_checkpoint_dir: shard directory (used only in error messages).

    Returns:
        (zero_stage, world_size, fp32_flat_groups) — fp32_flat_groups holds,
        per rank, either a list of per-group flat partitions (zero1/2) or one
        merged flat tensor (zero3).

    Raises:
        ValueError: when the files are not zero checkpoints, the shard count
            disagrees with the recorded world size, or the stage is unknown.
    """

    total_files = len(files)
    state_dicts = []
    for f in files:
        state_dict = torch.load(f, map_location=device)
        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
        # and also handle the case where it was already removed by another helper script
        state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
        state_dicts.append(state_dict)

    if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]:
        raise ValueError(f"{files[0]} is not a zero checkpoint")
    zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
    world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]

    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
    # parameters can be different from data parallelism for non-expert parameters. So we can just
    # use the max of the partition_count to get the dp world_size.

    if type(world_size) is list:
        world_size = max(world_size)

    if world_size != total_files:
        raise ValueError(
            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
        )

    # the groups are named differently in each stage
    if zero_stage <= 2:
        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
    elif zero_stage == 3:
        fp32_groups_key = FP32_FLAT_GROUPS
    else:
        raise ValueError(f"unknown zero stage {zero_stage}")

    if zero_stage <= 2:
        fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
    elif zero_stage == 3:
        # if there is more than one param group, there will be multiple flattened tensors - one
        # flattened tensor per group - for simplicity merge them into a single tensor
        #
        # XXX: could make the script more memory efficient for when there are multiple groups - it
        # will require matching the sub-lists of param_shapes for each param group flattened tensor

        fp32_flat_groups = [
            torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts))
        ]

    return zero_stage, world_size, fp32_flat_groups
+
+
def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
    """
    Returns fp32 state_dict reconstructed from ds checkpoint

    Args:
        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
        - ``exclude_frozen_parameters``: when True, frozen parameters are omitted from the result

    """
    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")

    # the fp32 master weights live in the optimizer shards
    optim_files = get_optim_files(ds_checkpoint_dir)
    zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")

    # buffers, param shapes and frozen params come from the model-state shards
    model_files = get_model_state_files(ds_checkpoint_dir)

    zero_model_states = parse_model_states(model_files)
    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')

    # dispatch on stage: zero1/2 vs zero3 partition the weights differently
    if zero_stage <= 2:
        return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
                                                          exclude_frozen_parameters)
    elif zero_stage == 3:
        return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
                                                          exclude_frozen_parameters)
+
+
def _zero2_merge_frozen_params(state_dict, zero_model_states):
    """Copy frozen (non-trained) params into state_dict for zero1/2 checkpoints.

    Under zero1/2 the rank-0 fragments already hold each frozen param in full,
    so this is a straight copy — no concatenation or narrowing needed.
    """
    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
        return

    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
    frozen_param_fragments = zero_model_states[0].frozen_param_fragments

    if debug:
        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')

    wanted_params = len(frozen_param_shapes)
    wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
    avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
    print(f'Frozen params: Have {avail_numel} numels to process.')
    print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')

    total_params = 0
    total_numel = 0
    for name, shape in frozen_param_shapes.items():
        total_params += 1
        unpartitioned_numel = shape.numel()
        total_numel += unpartitioned_numel

        # fragment is already the full tensor here
        state_dict[name] = frozen_param_fragments[name]

        if debug:
            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")

    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+def _has_callable(obj, fn):
+ attr = getattr(obj, fn, None)
+ return callable(attr)
+
+
def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
    """Rebuild trainable params from zero1/2 flat fp32 partitions into state_dict.

    Each rank stores one flat fp32 partition per param group; concatenating
    the rank partitions restores each full flat group, which is then sliced
    back into individual parameters using rank 0's recorded shapes.
    """
    param_shapes = zero_model_states[0].param_shapes

    # Reconstruction protocol:
    #
    # XXX: document this

    if debug:
        for i in range(world_size):
            for j in range(len(fp32_flat_groups[0])):
                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")

    # XXX: memory usage doubles here (zero2)
    num_param_groups = len(fp32_flat_groups[0])
    merged_single_partition_of_fp32_groups = []
    for i in range(num_param_groups):
        merged_partitions = [sd[i] for sd in fp32_flat_groups]
        full_single_fp32_vector = torch.cat(merged_partitions, 0)
        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
    avail_numel = sum(
        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])

    if debug:
        wanted_params = sum([len(shapes) for shapes in param_shapes])
        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
        # not asserting if there is a mismatch due to possible padding
        print(f"Have {avail_numel} numels to process.")
        print(f"Need {wanted_numel} numels in {wanted_params} params.")

    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    total_numel = 0
    total_params = 0
    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
        offset = 0
        avail_numel = full_single_fp32_vector.numel()
        for name, shape in shapes.items():

            # shape may be a torch.Size (has .numel) or a plain tuple/list — handle both
            unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
            total_numel += unpartitioned_numel
            total_params += 1

            if debug:
                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
            offset += unpartitioned_numel

        # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
        # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
        # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
        # live optimizer object, so we are checking that the numbers are within the right range
        align_to = 2 * world_size

        def zero2_align(x):
            # round x up to the nearest multiple of align_to
            return align_to * math.ceil(x / align_to)

        if debug:
            print(f"original offset={offset}, avail_numel={avail_numel}")

        offset = zero2_align(offset)
        avail_numel = zero2_align(avail_numel)

        if debug:
            print(f"aligned offset={offset}, avail_numel={avail_numel}")

        # Sanity check
        if offset != avail_numel:
            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")

    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+
+
def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
                                               exclude_frozen_parameters):
    """Assemble the full fp32 state_dict from zero1/2 shards.

    Order: buffers first, then (optionally) frozen params, then trainable
    params reconstructed from the flat partitions, and finally the
    shared-parameter aliases are restored.
    """
    state_dict = OrderedDict()

    # buffers
    rank0 = zero_model_states[0]
    state_dict.update(rank0.buffers)
    if debug:
        print(f"added {len(rank0.buffers)} buffers")

    if not exclude_frozen_parameters:
        _zero2_merge_frozen_params(state_dict, zero_model_states)

    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)

    # recover shared parameters ([alias, source] pairs)
    for alias, source in rank0.shared_params:
        if source in state_dict:
            state_dict[alias] = state_dict[source]

    return state_dict
+
+
def zero3_partitioned_param_info(unpartitioned_numel, world_size):
    """Return (numel stored per rank, trailing padding numel) for a zero3 param."""
    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
    leftover = unpartitioned_numel % world_size
    padding_numel = 0 if leftover == 0 else world_size - leftover
    return partitioned_numel, padding_numel
+
+
def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
    """Reassemble frozen (non-trained) params from zero3 shards into state_dict.

    Unlike zero1/2, frozen params are partitioned across ranks under zero3, so
    each param is rebuilt by concatenating every rank's fragment and trimming
    the per-rank padding.
    """
    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
        return

    if debug:
        for i in range(world_size):
            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')

    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
    wanted_params = len(frozen_param_shapes)
    wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
    avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
    print(f'Frozen params: Have {avail_numel} numels to process.')
    print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')

    total_params = 0
    total_numel = 0
    for name, shape in zero_model_states[0].frozen_param_shapes.items():
        total_params += 1
        unpartitioned_numel = shape.numel()
        total_numel += unpartitioned_numel

        # concatenate this param's fragment from every rank, then drop the padding tail
        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
        state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)

        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)

        if debug:
            print(
                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
            )

    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
    """Rebuild trainable params from zero3 flat fp32 shards into state_dict.

    Under zero3 every param is split across ranks at per-param boundaries;
    each one is re-assembled by narrowing the same offset window in every
    rank's flat tensor, concatenating, and trimming the padding.
    """
    param_shapes = zero_model_states[0].param_shapes
    # NOTE(review): this assignment is dead — avail_numel is recomputed identically below
    avail_numel = fp32_flat_groups[0].numel() * world_size
    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
    # param, re-consolidating each param, while dealing with padding if any

    # merge list of dicts, preserving order
    param_shapes = {k: v for d in param_shapes for k, v in d.items()}

    if debug:
        for i in range(world_size):
            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")

    wanted_params = len(param_shapes)
    wanted_numel = sum(shape.numel() for shape in param_shapes.values())
    # not asserting if there is a mismatch due to possible padding
    avail_numel = fp32_flat_groups[0].numel() * world_size
    print(f"Trainable params: Have {avail_numel} numels to process.")
    print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")

    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    offset = 0
    total_numel = 0
    total_params = 0
    for name, shape in param_shapes.items():

        unpartitioned_numel = shape.numel()
        total_numel += unpartitioned_numel
        total_params += 1

        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)

        if debug:
            print(
                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
            )

        # XXX: memory usage doubles here
        state_dict[name] = torch.cat(
            tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)),
            0).narrow(0, 0, unpartitioned_numel).view(shape)
        offset += partitioned_numel

    # offset advanced once per param at per-rank granularity; scale to total numel consumed
    offset *= world_size

    # Sanity check
    if offset != avail_numel:
        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")

    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+
+
def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
                                               exclude_frozen_parameters):
    """Assemble the full fp32 state_dict from zero3 shards.

    Order: buffers first, then (optionally) frozen params, then trainable
    params, and finally the shared-parameter aliases are restored.
    """
    state_dict = OrderedDict()

    # buffers
    buffers = zero_model_states[0].buffers
    state_dict.update(buffers)
    if debug:
        print(f"added {len(buffers)} buffers")

    if not exclude_frozen_parameters:
        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)

    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)

    # recover shared parameters
    for pair in zero_model_states[0].shared_params:
        # pair == [alias_name, source_name]; the alias points at the same tensor
        if pair[1] in state_dict:
            state_dict[pair[0]] = state_dict[pair[1]]

    return state_dict
+
+
def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False):
    """
    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
    via a model hub.

    Args:
        - ``checkpoint_dir``: path to the desired checkpoint folder
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
        - ``exclude_frozen_parameters``: exclude frozen parameters

    Returns:
        - pytorch ``state_dict``

    Note: this approach may not work if your application doesn't have sufficient free CPU memory and
    you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
    the checkpoint.

    A typical usage might be ::

        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
        # do the training and checkpoint saving
        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
        model = model.cpu() # move to cpu
        model.load_state_dict(state_dict)
        # submit to model hub or save the model to share with others

    In this example the ``model`` will no longer be usable in the deepspeed context of the same
    application. i.e. you will need to re-initialize the deepspeed engine, since
    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.

    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.

    """
    if tag is None:
        # resolve the tag from the 'latest' file DeepSpeed writes next to the shard folders
        latest_path = os.path.join(checkpoint_dir, 'latest')
        if os.path.isfile(latest_path):
            with open(latest_path, 'r') as fd:
                tag = fd.read().strip()
        else:
            raise ValueError(f"Unable to find 'latest' file at {latest_path}")

    # the actual shards live in the tag subfolder, e.g. checkpoint_dir/global_step14
    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)

    if not os.path.isdir(ds_checkpoint_dir):
        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")

    return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+
+
def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False):
    """
    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
    loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.

    Args:
        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
        - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
        - ``exclude_frozen_parameters``: exclude frozen parameters
    """

    # reconstruct fully in memory, then persist with torch.save
    consolidated = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters)
    print(f"Saving fp32 state dict to {output_file}")
    torch.save(consolidated, output_file)
+
+
def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
    """
    1. Put the provided model to cpu
    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
    3. Load it into the provided model

    Args:
        - ``model``: the model object to update
        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``

    Returns:
        - ``model``: modified model

    Make sure you have plenty of CPU memory available before you call this function. If you don't
    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
    conveniently placed for you in the checkpoint folder.

    A typical usage might be ::

        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
        # submit to model hub or save the model to share with others

    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.

    """
    logger.info(f"Extracting fp32 weights")
    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)

    logger.info(f"Overwriting model with fp32 weights")
    model = model.cpu()
    # strict=False: tolerate keys that differ between the model and the
    # reconstructed dict (e.g. frozen params excluded from the checkpoint)
    model.load_state_dict(state_dict, strict=False)

    return model
+
+
if __name__ == "__main__":
    # Command-line entry point: consolidate a ZeRO checkpoint into one fp32 file.
    parser = argparse.ArgumentParser()
    parser.add_argument("checkpoint_dir", type=str,
                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
    parser.add_argument("output_file", type=str,
                        help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)")
    parser.add_argument("-t", "--tag", type=str, default=None,
                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
    args = parser.parse_args()

    # propagate verbosity to the module-level flag used throughout
    debug = args.debug

    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
                                               args.output_file,
                                               tag=args.tag,
                                               exclude_frozen_parameters=args.exclude_frozen_parameters)
diff --git a/config.json b/config.json
index 358245ff2f9aa0dab8c73b59c5e35464c3e7b467..dd4e5ad4ad4c9bafb071dccdbf016fd2b2c567d4 100644
--- a/config.json
+++ b/config.json
@@ -1,6 +1,6 @@
{
"_attn_implementation_autoset": true,
- "_name_or_path": "meta-llama/Llama-3.1-70B-Instruct",
+ "_name_or_path": "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF",
"architectures": [
"LlamaForCausalLM"
],
diff --git a/tokenizer_config.json b/tokenizer_config.json
index ca91a2ef55f4239a7af81d7c9abb05f53621a07b..fdde94c29816839ec3c29d6c6461206a49911f3c 100644
--- a/tokenizer_config.json
+++ b/tokenizer_config.json
@@ -2050,7 +2050,7 @@
}
},
"bos_token": "<|begin_of_text|>",
- "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n",
+ "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n",
"clean_up_tokenization_spaces": true,
"eos_token": "<|eot_id|>",
"extra_special_tokens": {},